summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild1
-rw-r--r--arch/x86/Kconfig224
-rw-r--r--arch/x86/Kconfig.cpu19
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/Makefile_32.cpu2
-rw-r--r--arch/x86/boot/Makefile9
-rw-r--r--arch/x86/boot/memory.c2
-rw-r--r--arch/x86/boot/tools/build.c33
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c9
-rw-r--r--arch/x86/crypto/fpu.c10
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c22
-rw-r--r--arch/x86/ia32/ia32entry.S12
-rw-r--r--arch/x86/ia32/sys_ia32.c2
-rw-r--r--arch/x86/include/asm/acpi.h4
-rw-r--r--arch/x86/include/asm/alternative-asm.h9
-rw-r--r--arch/x86/include/asm/alternative.h17
-rw-r--r--arch/x86/include/asm/amd_iommu.h35
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h47
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h560
-rw-r--r--arch/x86/include/asm/amd_nb.h1
-rw-r--r--arch/x86/include/asm/apb_timer.h25
-rw-r--r--arch/x86/include/asm/apic.h40
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/asm.h5
-rw-r--r--arch/x86/include/asm/atomic.h10
-rw-r--r--arch/x86/include/asm/atomic64_32.h2
-rw-r--r--arch/x86/include/asm/atomic64_64.h2
-rw-r--r--arch/x86/include/asm/bios_ebda.h28
-rw-r--r--arch/x86/include/asm/bitops.h5
-rw-r--r--arch/x86/include/asm/calling.h130
-rw-r--r--arch/x86/include/asm/clocksource.h18
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h48
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h45
-rw-r--r--arch/x86/include/asm/cpufeature.h27
-rw-r--r--arch/x86/include/asm/delay.h25
-rw-r--r--arch/x86/include/asm/desc.h152
-rw-r--r--arch/x86/include/asm/dma.h12
-rw-r--r--arch/x86/include/asm/efi.h1
-rw-r--r--arch/x86/include/asm/entry_arch.h4
-rw-r--r--arch/x86/include/asm/fixmap.h1
-rw-r--r--arch/x86/include/asm/frame.h11
-rw-r--r--arch/x86/include/asm/ftrace.h7
-rw-r--r--arch/x86/include/asm/gart.h24
-rw-r--r--arch/x86/include/asm/hw_irq.h3
-rw-r--r--arch/x86/include/asm/i8253.h18
-rw-r--r--arch/x86/include/asm/idle.h2
-rw-r--r--arch/x86/include/asm/io.h24
-rw-r--r--arch/x86/include/asm/io_apic.h30
-rw-r--r--arch/x86/include/asm/irq_vectors.h11
-rw-r--r--arch/x86/include/asm/irqflags.h11
-rw-r--r--arch/x86/include/asm/jump_label.h27
-rw-r--r--arch/x86/include/asm/kdebug.h1
-rw-r--r--arch/x86/include/asm/kgdb.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h243
-rw-r--r--arch/x86/include/asm/kvm_host.h99
-rw-r--r--arch/x86/include/asm/kvm_para.h20
-rw-r--r--arch/x86/include/asm/lguest_hcall.h1
-rw-r--r--arch/x86/include/asm/linkage.h5
-rw-r--r--arch/x86/include/asm/local.h2
-rw-r--r--arch/x86/include/asm/mce.h23
-rw-r--r--arch/x86/include/asm/memblock.h2
-rw-r--r--arch/x86/include/asm/mmu.h4
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/mmzone_32.h39
-rw-r--r--arch/x86/include/asm/mmzone_64.h26
-rw-r--r--arch/x86/include/asm/module.h2
-rw-r--r--arch/x86/include/asm/msr-index.h16
-rw-r--r--arch/x86/include/asm/nops.h146
-rw-r--r--arch/x86/include/asm/numa.h34
-rw-r--r--arch/x86/include/asm/numa_32.h10
-rw-r--r--arch/x86/include/asm/numa_64.h36
-rw-r--r--arch/x86/include/asm/numaq.h7
-rw-r--r--arch/x86/include/asm/olpc.h51
-rw-r--r--arch/x86/include/asm/olpc_ofw.h9
-rw-r--r--arch/x86/include/asm/paravirt.h9
-rw-r--r--arch/x86/include/asm/paravirt_types.h1
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/percpu.h45
-rw-r--r--arch/x86/include/asm/perf_event.h5
-rw-r--r--arch/x86/include/asm/perf_event_p4.h33
-rw-r--r--arch/x86/include/asm/pgtable_types.h7
-rw-r--r--arch/x86/include/asm/probe_roms.h8
-rw-r--r--arch/x86/include/asm/processor-flags.h2
-rw-r--r--arch/x86/include/asm/processor.h4
-rw-r--r--arch/x86/include/asm/prom.h13
-rw-r--r--arch/x86/include/asm/ptrace.h18
-rw-r--r--arch/x86/include/asm/pvclock.h9
-rw-r--r--arch/x86/include/asm/rwlock.h43
-rw-r--r--arch/x86/include/asm/segment.h2
-rw-r--r--arch/x86/include/asm/setup.h4
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h8
-rw-r--r--arch/x86/include/asm/spinlock.h39
-rw-r--r--arch/x86/include/asm/spinlock_types.h6
-rw-r--r--arch/x86/include/asm/srat.h39
-rw-r--r--arch/x86/include/asm/stacktrace.h3
-rw-r--r--arch/x86/include/asm/suspend_32.h2
-rw-r--r--arch/x86/include/asm/suspend_64.h5
-rw-r--r--arch/x86/include/asm/system.h85
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/include/asm/time.h6
-rw-r--r--arch/x86/include/asm/topology.h8
-rw-r--r--arch/x86/include/asm/traps.h4
-rw-r--r--arch/x86/include/asm/uaccess.h6
-rw-r--r--arch/x86/include/asm/uaccess_32.h1
-rw-r--r--arch/x86/include/asm/uaccess_64.h1
-rw-r--r--arch/x86/include/asm/unistd_32.h4
-rw-r--r--arch/x86/include/asm/unistd_64.h4
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h600
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h73
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h2513
-rw-r--r--arch/x86/include/asm/vdso.h14
-rw-r--r--arch/x86/include/asm/vgtod.h5
-rw-r--r--arch/x86/include/asm/vmx.h43
-rw-r--r--arch/x86/include/asm/vsyscall.h16
-rw-r--r--arch/x86/include/asm/vvar.h50
-rw-r--r--arch/x86/include/asm/x2apic.h62
-rw-r--r--arch/x86/include/asm/x86_init.h12
-rw-r--r--arch/x86/include/asm/xen/hypercall.h29
-rw-r--r--arch/x86/include/asm/xen/page.h5
-rw-r--r--arch/x86/include/asm/xen/pci.h21
-rw-r--r--arch/x86/include/asm/xen/trace_types.h18
-rw-r--r--arch/x86/kernel/Makefile7
-rw-r--r--arch/x86/kernel/acpi/boot.c8
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S14
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h6
-rw-r--r--arch/x86/kernel/acpi/sleep.c11
-rw-r--r--arch/x86/kernel/alternative.c224
-rw-r--r--arch/x86/kernel/amd_gart_64.c (renamed from arch/x86/kernel/pci-gart_64.c)11
-rw-r--r--arch/x86/kernel/amd_iommu.c2635
-rw-r--r--arch/x86/kernel/amd_iommu_init.c1540
-rw-r--r--arch/x86/kernel/apb_timer.c418
-rw-r--r--arch/x86/kernel/aperture_64.c36
-rw-r--r--arch/x86/kernel/apic/Makefile17
-rw-r--r--arch/x86/kernel/apic/apic.c149
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c26
-rw-r--r--arch/x86/kernel/apic/apic_noop.c9
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c13
-rw-r--r--arch/x86/kernel/apic/es7000_32.c19
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c4
-rw-r--r--arch/x86/kernel/apic/io_apic.c410
-rw-r--r--arch/x86/kernel/apic/numaq_32.c40
-rw-r--r--arch/x86/kernel/apic/probe_32.c118
-rw-r--r--arch/x86/kernel/apic/probe_64.c61
-rw-r--r--arch/x86/kernel/apic/summit_32.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c222
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c115
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c109
-rw-r--r--arch/x86/kernel/apm_32.c19
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c16
-rw-r--r--arch/x86/kernel/cpu/bugs.c5
-rw-r--r--arch/x86/kernel/cpu/common.c37
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig266
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile21
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c776
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c446
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c367
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c309
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c517
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c1029
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h353
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c327
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.c51
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.h9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c331
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c624
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c261
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c752
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h43
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c1607
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h224
-rw-r--r--arch/x86/kernel/cpu/cpufreq/sc520_freq.c194
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c636
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c452
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c481
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h49
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c467
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c47
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c20
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c152
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c313
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c22
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c182
-rw-r--r--arch/x86/kernel/cpu/perf_event.c218
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c50
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c543
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c145
-rw-r--r--arch/x86/kernel/devicetree.c79
-rw-r--r--arch/x86/kernel/dumpstack.c17
-rw-r--r--arch/x86/kernel/dumpstack_64.c37
-rw-r--r--arch/x86/kernel/entry_64.S84
-rw-r--r--arch/x86/kernel/ftrace.c16
-rw-r--r--arch/x86/kernel/head32.c1
-rw-r--r--arch/x86/kernel/hpet.c86
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/i8253.c183
-rw-r--r--arch/x86/kernel/i8259.c2
-rw-r--r--arch/x86/kernel/irq.c5
-rw-r--r--arch/x86/kernel/irqinit.c5
-rw-r--r--arch/x86/kernel/jump_label.c5
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kprobes.c5
-rw-r--r--arch/x86/kernel/kvm.c72
-rw-r--r--arch/x86/kernel/kvmclock.c8
-rw-r--r--arch/x86/kernel/microcode_amd.c21
-rw-r--r--arch/x86/kernel/module.c38
-rw-r--r--arch/x86/kernel/mpparse.c8
-rw-r--r--arch/x86/kernel/paravirt.c9
-rw-r--r--arch/x86/kernel/pci-calgary_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c64
-rw-r--r--arch/x86/kernel/pci-iommu_table.c18
-rw-r--r--arch/x86/kernel/probe_roms.c (renamed from arch/x86/kernel/probe_roms_32.c)103
-rw-r--r--arch/x86/kernel/process.c45
-rw-r--r--arch/x86/kernel/process_32.c1
-rw-r--r--arch/x86/kernel/process_64.c1
-rw-r--r--arch/x86/kernel/ptrace.c45
-rw-r--r--arch/x86/kernel/quirks.c5
-rw-r--r--arch/x86/kernel/reboot.c56
-rw-r--r--arch/x86/kernel/reboot_32.S12
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S2
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S2
-rw-r--r--arch/x86/kernel/setup.c16
-rw-r--r--arch/x86/kernel/signal.c70
-rw-r--r--arch/x86/kernel/smp.c5
-rw-r--r--arch/x86/kernel/smpboot.c44
-rw-r--r--arch/x86/kernel/stacktrace.c15
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/test_nx.c2
-rw-r--r--arch/x86/kernel/time.c4
-rw-r--r--arch/x86/kernel/traps.c8
-rw-r--r--arch/x86/kernel/tsc.c45
-rw-r--r--arch/x86/kernel/vmlinux.lds.S72
-rw-r--r--arch/x86/kernel/vsyscall_64.c322
-rw-r--r--arch/x86/kernel/vsyscall_emu_64.S27
-rw-r--r--arch/x86/kernel/x86_init.c6
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/emulate.c3135
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu.c1248
-rw-r--r--arch/x86/kvm/mmu.h25
-rw-r--r--arch/x86/kvm/mmu_audit.c12
-rw-r--r--arch/x86/kvm/mmutrace.h48
-rw-r--r--arch/x86/kvm/paging_tmpl.h293
-rw-r--r--arch/x86/kvm/svm.c591
-rw-r--r--arch/x86/kvm/timer.c2
-rw-r--r--arch/x86/kvm/trace.h31
-rw-r--r--arch/x86/kvm/vmx.c3015
-rw-r--r--arch/x86/kvm/x86.c902
-rw-r--r--arch/x86/kvm/x86.h46
-rw-r--r--arch/x86/lguest/boot.c43
-rw-r--r--arch/x86/lguest/i386_head.S35
-rw-r--r--arch/x86/lib/Makefile9
-rw-r--r--arch/x86/lib/atomic64_32.c2
-rw-r--r--arch/x86/lib/clear_page_64.S33
-rw-r--r--arch/x86/lib/copy_page_64.S9
-rw-r--r--arch/x86/lib/copy_user_64.S69
-rw-r--r--arch/x86/lib/memcpy_64.S47
-rw-r--r--arch/x86/lib/memmove_64.S28
-rw-r--r--arch/x86/lib/memset_64.S54
-rw-r--r--arch/x86/lib/rwlock.S44
-rw-r--r--arch/x86/lib/rwlock_64.S38
-rw-r--r--arch/x86/lib/rwsem.S (renamed from arch/x86/lib/rwsem_64.S)75
-rw-r--r--arch/x86/lib/semaphore_32.S124
-rw-r--r--arch/x86/lib/thunk_64.S45
-rw-r--r--arch/x86/lib/usercopy.c43
-rw-r--r--arch/x86/mm/Makefile4
-rw-r--r--arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/amdtopology_64.c)21
-rw-r--r--arch/x86/mm/fault.c34
-rw-r--r--arch/x86/mm/hugetlbpage.c4
-rw-r--r--arch/x86/mm/init.c26
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/ioremap.c14
-rw-r--r--arch/x86/mm/kmemcheck/error.c2
-rw-r--r--arch/x86/mm/memblock.c4
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/mm/numa.c596
-rw-r--r--arch/x86/mm/numa_32.c404
-rw-r--r--arch/x86/mm/numa_64.c644
-rw-r--r--arch/x86/mm/numa_emulation.c36
-rw-r--r--arch/x86/mm/numa_internal.h8
-rw-r--r--arch/x86/mm/pageattr-test.c3
-rw-r--r--arch/x86/mm/pf_in.c14
-rw-r--r--arch/x86/mm/srat.c (renamed from arch/x86/mm/srat_64.c)82
-rw-r--r--arch/x86/mm/srat_32.c288
-rw-r--r--arch/x86/net/Makefile4
-rw-r--r--arch/x86/net/bpf_jit.S140
-rw-r--r--arch/x86/net/bpf_jit_comp.c654
-rw-r--r--arch/x86/oprofile/backtrace.c34
-rw-r--r--arch/x86/oprofile/nmi_int.c14
-rw-r--r--arch/x86/oprofile/op_model_amd.c100
-rw-r--r--arch/x86/pci/acpi.c9
-rw-r--r--arch/x86/pci/ce4100.c2
-rw-r--r--arch/x86/pci/common.c14
-rw-r--r--arch/x86/pci/direct.c23
-rw-r--r--arch/x86/pci/irq.c4
-rw-r--r--arch/x86/pci/mmconfig-shared.c13
-rw-r--r--arch/x86/pci/numaq_32.c2
-rw-r--r--arch/x86/pci/olpc.c4
-rw-r--r--arch/x86/pci/pcbios.c2
-rw-r--r--arch/x86/pci/visws.c2
-rw-r--r--arch/x86/pci/xen.c469
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts6
-rw-r--r--arch/x86/platform/efi/efi.c231
-rw-r--r--arch/x86/platform/efi/efi_64.c37
-rw-r--r--arch/x86/platform/mrst/mrst.c4
-rw-r--r--arch/x86/platform/olpc/Makefile9
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-pm.c215
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-rtc.c81
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c614
-rw-r--r--arch/x86/platform/olpc/olpc-xo1.c146
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c168
-rw-r--r--arch/x86/platform/olpc/olpc.c142
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c122
-rw-r--r--arch/x86/platform/olpc/xo1-wakeup.S124
-rw-r--r--arch/x86/platform/uv/tlb_uv.c1531
-rw-r--r--arch/x86/platform/uv/uv_time.c22
-rw-r--r--arch/x86/vdso/Makefile18
-rw-r--r--arch/x86/vdso/vclock_gettime.c141
-rw-r--r--arch/x86/vdso/vdso.S15
-rw-r--r--arch/x86/vdso/vdso.lds.S9
-rw-r--r--arch/x86/vdso/vextern.h16
-rw-r--r--arch/x86/vdso/vgetcpu.c3
-rw-r--r--arch/x86/vdso/vma.c77
-rw-r--r--arch/x86/vdso/vvar.c12
-rw-r--r--arch/x86/xen/Makefile4
-rw-r--r--arch/x86/xen/enlighten.c53
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu.c491
-rw-r--r--arch/x86/xen/mmu.h37
-rw-r--r--arch/x86/xen/multicalls.c177
-rw-r--r--arch/x86/xen/multicalls.h6
-rw-r--r--arch/x86/xen/p2m.c43
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c2
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/setup.c12
-rw-r--r--arch/x86/xen/smp.c20
-rw-r--r--arch/x86/xen/time.c14
-rw-r--r--arch/x86/xen/trace.c61
-rw-r--r--arch/x86/xen/vga.c67
-rw-r--r--arch/x86/xen/xen-ops.h13
352 files changed, 18878 insertions, 26370 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 0e103236b75..0e9dec6cadd 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -15,3 +15,4 @@ obj-y += vdso/
obj-$(CONFIG_IA32_EMULATION) += ia32/
obj-y += platform/
+obj-y += net/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a95bf..7cf916fc1ce 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -8,6 +8,7 @@ config 64BIT
config X86_32
def_bool !64BIT
+ select CLKSRC_I8253
config X86_64
def_bool 64BIT
@@ -16,11 +17,10 @@ config X86_64
config X86
def_bool y
select HAVE_AOUT if X86_32
- select HAVE_READQ
- select HAVE_WRITEQ
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
+ select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_IRQ_WORK
select HAVE_IOREMAP_PROT
@@ -65,13 +65,13 @@ config X86
select HAVE_GENERIC_HARDIRQS
select HAVE_SPARSE_IRQ
select GENERIC_FIND_FIRST_BIT
- select GENERIC_FIND_NEXT_BIT
select GENERIC_IRQ_PROBE
select GENERIC_PENDING_IRQ if SMP
select GENERIC_IRQ_SHOW
select IRQ_FORCED_THREADING
select USE_GENERIC_SMP_HELPERS if SMP
- select ARCH_NO_SYSDEV_OPS
+ select HAVE_BPF_JIT if (X86_64 && NET)
+ select CLKEVT_I8253
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -95,6 +95,10 @@ config CLOCKSOURCE_WATCHDOG
config GENERIC_CLOCKEVENTS
def_bool y
+config ARCH_CLOCKSOURCE_DATA
+ def_bool y
+ depends on X86_64
+
config GENERIC_CLOCKEVENTS_BROADCAST
def_bool y
depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
@@ -112,7 +116,14 @@ config MMU
def_bool y
config ZONE_DMA
- def_bool y
+ bool "DMA memory allocation support" if EXPERT
+ default y
+ help
+ DMA memory allocation support allows devices with less than 32-bit
+ addressing to allocate within the first 16MB of address space.
+ Disable if no such devices will be used.
+
+ If unsure, say Y.
config SBUS
bool
@@ -365,17 +376,6 @@ config X86_UV
# Following is an alphabetically sorted list of 32 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
-config X86_ELAN
- bool "AMD Elan"
- depends on X86_32
- depends on X86_EXTENDED_PLATFORM
- ---help---
- Select this for an AMD Elan processor.
-
- Do not use this option for K6/Athlon/Opteron processors!
-
- If unsure, choose "PC-compatible" instead.
-
config X86_INTEL_CE
bool "CE4100 TV platform"
depends on PCI
@@ -390,12 +390,21 @@ config X86_INTEL_CE
This option compiles in support for the CE4100 SOC for settop
boxes and media devices.
+config X86_INTEL_MID
+ bool "Intel MID platform support"
+ depends on X86_32
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ Select to build a kernel capable of supporting Intel MID platform
+ systems which do not have the PCI legacy interfaces (Moorestown,
+ Medfield). If you are building for a PC class system say N here.
+
+if X86_INTEL_MID
+
config X86_MRST
bool "Moorestown MID platform"
depends on PCI
depends on PCI_GOANY
- depends on X86_32
- depends on X86_EXTENDED_PLATFORM
depends on X86_IO_APIC
select APB_TIMER
select I2C
@@ -410,6 +419,8 @@ config X86_MRST
nor standard legacy replacement devices/features. e.g. Moorestown does
not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+endif
+
config X86_RDC321X
bool "RDC R-321x SoC"
depends on X86_32
@@ -518,6 +529,18 @@ menuconfig PARAVIRT_GUEST
if PARAVIRT_GUEST
+config PARAVIRT_TIME_ACCOUNTING
+ bool "Paravirtual steal time accounting"
+ select PARAVIRT
+ default n
+ ---help---
+ Select this option to enable fine granularity task steal time
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+
source "arch/x86/xen/Kconfig"
config KVM_CLOCK
@@ -623,6 +646,7 @@ config HPET_EMULATE_RTC
config APB_TIMER
def_bool y if MRST
prompt "Langwell APB Timer Support" if X86_MRST
+ select DW_APB_TIMER
help
APB timer is the replacement for 8254, HPET on X86 MID platforms.
The APBT provides a stable time base on SMP
@@ -686,32 +710,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
Calgary anyway, pass 'iommu=calgary' on the kernel command line.
If unsure, say Y.
-config AMD_IOMMU
- bool "AMD IOMMU support"
- select SWIOTLB
- select PCI_MSI
- depends on X86_64 && PCI && ACPI
- ---help---
- With this option you can enable support for AMD IOMMU hardware in
- your system. An IOMMU is a hardware component which provides
- remapping of DMA memory accesses from devices. With an AMD IOMMU you
- can isolate the the DMA memory of different devices and protect the
- system from misbehaving device drivers or hardware.
-
- You can find out if your system has an AMD IOMMU if you look into
- your BIOS for an option to enable it or if you have an IVRS ACPI
- table.
-
-config AMD_IOMMU_STATS
- bool "Export AMD IOMMU statistics to debugfs"
- depends on AMD_IOMMU
- select DEBUG_FS
- ---help---
- This option enables code in the AMD IOMMU driver to collect various
- statistics about whats happening in the driver and exports that
- information to userspace via debugfs.
- If unsure, say N.
-
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
def_bool y if X86_64
@@ -725,9 +723,6 @@ config SWIOTLB
config IOMMU_HELPER
def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
-config IOMMU_API
- def_bool (AMD_IOMMU || DMAR)
-
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
@@ -919,6 +914,7 @@ config TOSHIBA
config I8K
tristate "Dell laptop support"
+ select HWMON
---help---
This adds a driver to safely access the System Management Mode
of the CPU on the Dell Inspiron 8000. The System Management Mode
@@ -1201,7 +1197,7 @@ config NODES_SPAN_OTHER_NODES
config NUMA_EMU
bool "NUMA emulation"
- depends on X86_64 && NUMA
+ depends on NUMA
---help---
Enable NUMA emulation. A flat machine will be split
into virtual nodes when booted with "numa=fake=N", where N is the
@@ -1223,6 +1219,10 @@ config HAVE_ARCH_BOOTMEM
def_bool y
depends on X86_32 && NUMA
+config HAVE_ARCH_ALLOC_REMAP
+ def_bool y
+ depends on X86_32 && NUMA
+
config ARCH_HAVE_MEMORY_PRESENT
def_bool y
depends on X86_32 && DISCONTIGMEM
@@ -1231,13 +1231,9 @@ config NEED_NODE_MEMMAP_SIZE
def_bool y
depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
-config HAVE_ARCH_ALLOC_REMAP
- def_bool y
- depends on X86_32 && NUMA
-
config ARCH_FLATMEM_ENABLE
def_bool y
- depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
+ depends on X86_32 && !NUMA
config ARCH_DISCONTIGMEM_ENABLE
def_bool y
@@ -1247,20 +1243,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
def_bool y
depends on NUMA && X86_32
-config ARCH_PROC_KCORE_TEXT
- def_bool y
- depends on X86_64 && PROC_KCORE
-
-config ARCH_SPARSEMEM_DEFAULT
- def_bool y
- depends on X86_64
-
config ARCH_SPARSEMEM_ENABLE
def_bool y
depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
+config ARCH_SPARSEMEM_DEFAULT
+ def_bool y
+ depends on X86_64
+
config ARCH_SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SPARSEMEM_ENABLE
@@ -1269,6 +1261,10 @@ config ARCH_MEMORY_PROBE
def_bool X86_64
depends on MEMORY_HOTPLUG
+config ARCH_PROC_KCORE_TEXT
+ def_bool y
+ depends on X86_64 && PROC_KCORE
+
config ILLEGAL_POINTER_VALUE
hex
default 0 if X86_32
@@ -1703,10 +1699,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
def_bool y
depends on MEMORY_HOTPLUG
-config HAVE_ARCH_EARLY_PFN_TO_NID
- def_bool X86_64
- depends on NUMA
-
config USE_PERCPU_NUMA_NODE_ID
def_bool y
depends on NUMA
@@ -1745,8 +1737,8 @@ menuconfig APM
machines with more than one CPU.
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/power/pm.txt> and the
- Battery Powered Linux mini-HOWTO, available from
+ and more information, read <file:Documentation/power/apm-acpi.txt>
+ and the Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
This driver does not spin down disk drives (see the hdparm(8)
@@ -1848,7 +1840,7 @@ config APM_ALLOW_INTS
endif # APM
-source "arch/x86/kernel/cpu/cpufreq/Kconfig"
+source "drivers/cpufreq/Kconfig"
source "drivers/cpuidle/Kconfig"
@@ -1913,7 +1905,7 @@ config PCI_BIOS
# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
config PCI_DIRECT
def_bool y
- depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC))
+ depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG))
config PCI_MMCONFIG
def_bool y
@@ -1950,55 +1942,6 @@ config PCI_CNB20LE_QUIRK
You should say N unless you know you need this.
-config DMAR
- bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
- depends on PCI_MSI && ACPI && EXPERIMENTAL
- help
- DMA remapping (DMAR) devices support enables independent address
- translations for Direct Memory Access (DMA) from devices.
- These DMA remapping devices are reported via ACPI tables
- and include PCI device scope covered by these DMA
- remapping devices.
-
-config DMAR_DEFAULT_ON
- def_bool y
- prompt "Enable DMA Remapping Devices by default"
- depends on DMAR
- help
- Selecting this option will enable a DMAR device at boot time if
- one is found. If this option is not selected, DMAR support can
- be enabled by passing intel_iommu=on to the kernel. It is
- recommended you say N here while the DMAR code remains
- experimental.
-
-config DMAR_BROKEN_GFX_WA
- bool "Workaround broken graphics drivers (going away soon)"
- depends on DMAR && BROKEN
- ---help---
- Current Graphics drivers tend to use physical address
- for DMA and avoid using DMA APIs. Setting this config
- option permits the IOMMU driver to set a unity map for
- all the OS-visible memory. Hence the driver can continue
- to use physical addresses for DMA, at least until this
- option is removed in the 2.6.32 kernel.
-
-config DMAR_FLOPPY_WA
- def_bool y
- depends on DMAR
- ---help---
- Floppy disk drivers are known to bypass DMA API calls
- thereby failing to work when IOMMU is enabled. This
- workaround will setup a 1:1 mapping for the first
- 16MiB to make floppy (an ISA device) work.
-
-config INTR_REMAP
- bool "Support for Interrupt Remapping (EXPERIMENTAL)"
- depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
- ---help---
- Supports Interrupt remapping for IO-APIC and MSI devices.
- To use x2apic mode in the CPU's which support x2APIC enhancements or
- to support platforms with CPU's having > 8 bit APIC ID, say Y.
-
source "drivers/pci/pcie/Kconfig"
source "drivers/pci/Kconfig"
@@ -2076,16 +2019,49 @@ config OLPC
depends on !X86_PAE
select GPIOLIB
select OF
- select OF_PROMTREE if PROC_DEVICETREE
+ select OF_PROMTREE
---help---
Add support for detecting the unique features of the OLPC
XO hardware.
-config OLPC_XO1
- tristate "OLPC XO-1 support"
- depends on OLPC && MFD_CS5535
- ---help---
- Add support for non-essential features of the OLPC XO-1 laptop.
+config OLPC_XO1_PM
+ bool "OLPC XO-1 Power Management"
+ depends on OLPC && MFD_CS5535 && PM_SLEEP
+ select MFD_CORE
+ ---help---
+ Add support for poweroff and suspend of the OLPC XO-1 laptop.
+
+config OLPC_XO1_RTC
+ bool "OLPC XO-1 Real Time Clock"
+ depends on OLPC_XO1_PM && RTC_DRV_CMOS
+ ---help---
+ Add support for the XO-1 real time clock, which can be used as a
+ programmable wakeup source.
+
+config OLPC_XO1_SCI
+ bool "OLPC XO-1 SCI extras"
+ depends on OLPC && OLPC_XO1_PM
+ select POWER_SUPPLY
+ select GPIO_CS5535
+ select MFD_CORE
+ ---help---
+ Add support for SCI-based features of the OLPC XO-1 laptop:
+ - EC-driven system wakeups
+ - Power button
+ - Ebook switch
+ - Lid switch
+ - AC adapter status updates
+ - Battery status updates
+
+config OLPC_XO15_SCI
+ bool "OLPC XO-1.5 SCI extras"
+ depends on OLPC && ACPI
+ select POWER_SUPPLY
+ ---help---
+ Add support for SCI-based features of the OLPC XO-1.5 laptop:
+ - EC-driven system wakeups
+ - AC adapter status updates
+ - Battery status updates
endif # X86_32
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index d161e939df6..e3ca7e0d858 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -1,6 +1,4 @@
# Put here option for CPU selection and depending optimization
-if !X86_ELAN
-
choice
prompt "Processor family"
default M686 if X86_32
@@ -203,6 +201,14 @@ config MWINCHIP3D
stores for this CPU, which can increase performance of some
operations.
+config MELAN
+ bool "AMD Elan"
+ depends on X86_32
+ ---help---
+ Select this for an AMD Elan processor.
+
+ Do not use this option for K6/Athlon/Opteron processors!
+
config MGEODEGX1
bool "GeodeGX1"
depends on X86_32
@@ -292,8 +298,6 @@ config X86_GENERIC
This is really intended for distributors who need more
generic optimizations.
-endif
-
#
# Define implied options from the CPU selection here
config X86_INTERNODE_CACHE_SHIFT
@@ -308,11 +312,14 @@ config X86_CMPXCHG
config CMPXCHG_LOCAL
def_bool X86_64 || (X86_32 && !M386)
+config CMPXCHG_DOUBLE
+ def_bool y
+
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
- default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
+ default "4" if MELAN || M486 || M386 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
config X86_XADD
@@ -358,7 +365,7 @@ config X86_POPAD_OK
config X86_ALIGNMENT_16
def_bool y
- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
config X86_INTEL_USERCOPY
def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 615e18810f4..c0f8a5c8891 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -66,26 +66,6 @@ config DEBUG_STACKOVERFLOW
This option will cause messages to be printed if free stack space
drops below a certain limit.
-config DEBUG_STACK_USAGE
- bool "Stack utilization instrumentation"
- depends on DEBUG_KERNEL
- ---help---
- Enables the display of the minimum amount of free stack which each
- task has ever had available in the sysrq-T and sysrq-P debug output.
-
- This option will slow down process creation somewhat.
-
-config DEBUG_PER_CPU_MAPS
- bool "Debug access to per_cpu maps"
- depends on DEBUG_KERNEL
- depends on SMP
- ---help---
- Say Y to verify that the per_cpu map being accessed has
- been setup. Adds a fair amount of code to kernel memory
- and decreases performance.
-
- Say N if unsure.
-
config X86_PTDUMP
bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index f2ee1abb1df..86cee7b749e 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=
$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
# AMD Elan support
-cflags-$(CONFIG_X86_ELAN) += -march=i486
+cflags-$(CONFIG_MELAN) += -march=i486
# Geode GX1 support
cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index f7cb086b4ad..95365a82b6a 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,12 +9,6 @@
# Changed by many, many contributors over the years.
#
-# ROOT_DEV specifies the default root-device when making the image.
-# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
-# the default of FLOPPY is used by 'build'.
-
-ROOT_DEV := CURRENT
-
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
@@ -75,8 +69,7 @@ GCOV_PROFILE := n
$(obj)/bzImage: asflags-y := $(SVGA_MODE)
quiet_cmd_image = BUILD $@
-cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
- $(ROOT_DEV) > $@
+cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin > $@
$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
$(call if_changed,image)
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index cae3feb1035..db75d07c364 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -91,7 +91,7 @@ static int detect_memory_e801(void)
if (oreg.ax > 15*1024) {
return -1; /* Bogus! */
} else if (oreg.ax == 15*1024) {
- boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+ boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
} else {
/*
* This ignores memory above 16MB if we have a memory
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index ee3a4ea923a..fdc60a0b3c2 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
static void usage(void)
{
- die("Usage: build setup system [rootdev] [> image]");
+ die("Usage: build setup system [> image]");
}
int main(int argc, char ** argv)
@@ -138,39 +138,14 @@ int main(int argc, char ** argv)
unsigned int i, sz, setup_sectors;
int c;
u32 sys_size;
- u8 major_root, minor_root;
struct stat sb;
FILE *file;
int fd;
void *kernel;
u32 crc = 0xffffffffUL;
- if ((argc < 3) || (argc > 4))
+ if (argc != 3)
usage();
- if (argc > 3) {
- if (!strcmp(argv[3], "CURRENT")) {
- if (stat("/", &sb)) {
- perror("/");
- die("Couldn't stat /");
- }
- major_root = major(sb.st_dev);
- minor_root = minor(sb.st_dev);
- } else if (strcmp(argv[3], "FLOPPY")) {
- if (stat(argv[3], &sb)) {
- perror(argv[3]);
- die("Couldn't stat root device.");
- }
- major_root = major(sb.st_rdev);
- minor_root = minor(sb.st_rdev);
- } else {
- major_root = 0;
- minor_root = 0;
- }
- } else {
- major_root = DEFAULT_MAJOR_ROOT;
- minor_root = DEFAULT_MINOR_ROOT;
- }
- fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
/* Copy the setup code */
file = fopen(argv[1], "r");
@@ -193,8 +168,8 @@ int main(int argc, char ** argv)
memset(buf+c, 0, i-c);
/* Set the default root device */
- buf[508] = minor_root;
- buf[509] = major_root;
+ buf[508] = DEFAULT_MINOR_ROOT;
+ buf[509] = DEFAULT_MAJOR_ROOT;
fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i);
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 6f9872658dd..2bf18059fbe 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -10,7 +10,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_AUDIT=y
CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
-CONFIG_CGROUP_NS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ee01a9d5d4f..22a0dc8e51d 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -11,7 +11,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_AUDIT=y
CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
-CONFIG_CGROUP_NS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1a58ad89fdf..c04f1b7a913 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,8 +2,6 @@
# Arch-specific CryptoAPI modules.
#
-obj-$(CONFIG_CRYPTO_FPU) += fpu.o
-
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -24,6 +22,6 @@ aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2577613fb32..feee8ff1d05 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -94,6 +94,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+
+int crypto_fpu_init(void);
+void crypto_fpu_exit(void);
+
#ifdef CONFIG_X86_64
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
@@ -1257,6 +1261,8 @@ static int __init aesni_init(void)
return -ENODEV;
}
+ if ((err = crypto_fpu_init()))
+ goto fpu_err;
if ((err = crypto_register_alg(&aesni_alg)))
goto aes_err;
if ((err = crypto_register_alg(&__aesni_alg)))
@@ -1334,6 +1340,7 @@ blk_ecb_err:
__aes_err:
crypto_unregister_alg(&aesni_alg);
aes_err:
+fpu_err:
return err;
}
@@ -1363,6 +1370,8 @@ static void __exit aesni_exit(void)
crypto_unregister_alg(&blk_ecb_alg);
crypto_unregister_alg(&__aesni_alg);
crypto_unregister_alg(&aesni_alg);
+
+ crypto_fpu_exit();
}
module_init(aesni_init);
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 1a8f8649c03..98d7a188f46 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -150,18 +150,12 @@ static struct crypto_template crypto_fpu_tmpl = {
.module = THIS_MODULE,
};
-static int __init crypto_fpu_module_init(void)
+int __init crypto_fpu_init(void)
{
return crypto_register_template(&crypto_fpu_tmpl);
}
-static void __exit crypto_fpu_module_exit(void)
+void __exit crypto_fpu_exit(void)
{
crypto_unregister_template(&crypto_fpu_tmpl);
}
-
-module_init(crypto_fpu_module_init);
-module_exit(crypto_fpu_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 7a6e68e4f74..976aa64d9a2 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -245,7 +245,7 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
& CRYPTO_TFM_RES_MASK);
- return 0;
+ return err;
}
static int ghash_async_init_tfm(struct crypto_tfm *tfm)
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 588a7aa937e..65577698cab 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -127,15 +127,17 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
+ sigset_t blocked;
+
current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+
+ mask &= _BLOCKABLE;
+ siginitset(&blocked, mask);
+ set_current_blocked(&blocked);
current->state = TASK_INTERRUPTIBLE;
schedule();
+
set_restore_sigmask();
return -ERESTARTNOHAND;
}
@@ -279,10 +281,7 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs)
goto badframe;
sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&set);
if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
goto badframe;
@@ -308,10 +307,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
goto badframe;
sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&set);
if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 849a9d23c71..a0e866d233e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -143,7 +143,7 @@ ENTRY(ia32_sysenter_target)
CFI_REL_OFFSET rip,0
pushq_cfi %rax
cld
- SAVE_ARGS 0,0,1
+ SAVE_ARGS 0,1,0
/* no need to do an access_ok check here because rbp has been
32bit zero extended */
1: movl (%rbp),%ebp
@@ -173,7 +173,7 @@ sysexit_from_sys_call:
andl $~0x200,EFLAGS-R11(%rsp)
movl RIP-R11(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
- RESTORE_ARGS 1,24,1,1,1,1
+ RESTORE_ARGS 0,24,0,0,0,0
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
@@ -289,7 +289,7 @@ ENTRY(ia32_cstar_target)
* disabled irqs and here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1,1
+ SAVE_ARGS 8,0,0
movl %eax,%eax /* zero extension */
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
@@ -328,7 +328,7 @@ cstar_dispatch:
jnz sysretl_audit
sysretl_from_sys_call:
andl $~TS_COMPAT,TI_status(%r10)
- RESTORE_ARGS 1,-ARG_SKIP,1,1,1
+ RESTORE_ARGS 0,-ARG_SKIP,0,0,0
movl RIP-ARGOFFSET(%rsp),%ecx
CFI_REGISTER rip,rcx
movl EFLAGS-ARGOFFSET(%rsp),%r11d
@@ -419,7 +419,7 @@ ENTRY(ia32_syscall)
cld
/* note the registers are not zero extended to the sf.
this could be a problem. */
- SAVE_ARGS 0,0,1
+ SAVE_ARGS 0,1,0
GET_THREAD_INFO(%r10)
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
@@ -848,4 +848,6 @@ ia32_sys_call_table:
.quad compat_sys_open_by_handle_at
.quad compat_sys_clock_adjtime
.quad sys_syncfs
+ .quad compat_sys_sendmmsg /* 345 */
+ .quad sys_setns
ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 5852519b2d0..f6f5c53dc90 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -43,7 +43,7 @@
#include <asm/mman.h>
#include <asm/types.h>
#include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/vgtod.h>
#include <asm/sys_ia32.h>
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 12e0e7dd869..610001d385d 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -139,7 +139,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
boot_cpu_data.x86_model <= 0x05 &&
boot_cpu_data.x86_mask < 0x0A)
return 1;
- else if (c1e_detected)
+ else if (amd_e400_c1e_detected)
return 1;
else
return max_cstate;
@@ -183,8 +183,6 @@ static inline void disable_acpi(void) { }
#define ARCH_HAS_POWER_INIT 1
-struct bootnode;
-
#ifdef CONFIG_ACPI_NUMA
extern int acpi_numa;
extern int x86_acpi_numa_init(void);
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index a63a68be1cc..4554cc6fb96 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -15,4 +15,13 @@
.endm
#endif
+.macro altinstruction_entry orig alt feature orig_len alt_len
+ .align 8
+ .long \orig - .
+ .long \alt - .
+ .word \feature
+ .byte \orig_len
+ .byte \alt_len
+.endm
+
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 13009d1af99..23fb6d79f20 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,7 +4,6 @@
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/stringify.h>
-#include <linux/jump_label.h>
#include <asm/asm.h>
/*
@@ -44,8 +43,8 @@
#endif
struct alt_instr {
- u8 *instr; /* original instruction */
- u8 *replacement;
+ s32 instr_offset; /* original instruction */
+ s32 repl_offset; /* offset to replacement instruction */
u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction, <= instrlen */
@@ -85,8 +84,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
"661:\n\t" oldinstr "\n662:\n" \
".section .altinstructions,\"a\"\n" \
_ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
+ " .long 661b - .\n" /* label */ \
+ " .long 663f - .\n" /* new instruction */ \
" .word " __stringify(feature) "\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
@@ -191,12 +190,4 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
extern void text_poke_smp_batch(struct text_poke_param *params, int n);
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
-#define IDEAL_NOP_SIZE_5 5
-extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
-extern void arch_init_ideal_nop5(void);
-#else
-static inline void arch_init_ideal_nop5(void) {}
-#endif
-
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
deleted file mode 100644
index a6863a2dec1..00000000000
--- a/arch/x86/include/asm/amd_iommu.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_H
-#define _ASM_X86_AMD_IOMMU_H
-
-#include <linux/irqreturn.h>
-
-#ifdef CONFIG_AMD_IOMMU
-
-extern int amd_iommu_detect(void);
-
-#else
-
-static inline int amd_iommu_detect(void) { return -ENODEV; }
-
-#endif
-
-#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
deleted file mode 100644
index 916bc8111a0..00000000000
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
-#define _ASM_X86_AMD_IOMMU_PROTO_H
-
-struct amd_iommu;
-
-extern int amd_iommu_init_dma_ops(void);
-extern int amd_iommu_init_passthrough(void);
-extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_flush_all_domains(void);
-extern void amd_iommu_flush_all_devices(void);
-extern void amd_iommu_apply_erratum_63(u16 devid);
-extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
-extern int amd_iommu_init_devices(void);
-extern void amd_iommu_uninit_devices(void);
-extern void amd_iommu_init_notifier(void);
-extern void amd_iommu_init_api(void);
-#ifndef CONFIG_AMD_IOMMU_STATS
-
-static inline void amd_iommu_stats_init(void) { }
-
-#endif /* !CONFIG_AMD_IOMMU_STATS */
-
-static inline bool is_rd890_iommu(struct pci_dev *pdev)
-{
- return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
- (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
-}
-
-#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
deleted file mode 100644
index e3509fc303b..00000000000
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ /dev/null
@@ -1,560 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
-#define _ASM_X86_AMD_IOMMU_TYPES_H
-
-#include <linux/types.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-/*
- * Maximum number of IOMMUs supported
- */
-#define MAX_IOMMUS 32
-
-/*
- * some size calculation constants
- */
-#define DEV_TABLE_ENTRY_SIZE 32
-#define ALIAS_TABLE_ENTRY_SIZE 2
-#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
-
-/* Length of the MMIO region for the AMD IOMMU */
-#define MMIO_REGION_LENGTH 0x4000
-
-/* Capability offsets used by the driver */
-#define MMIO_CAP_HDR_OFFSET 0x00
-#define MMIO_RANGE_OFFSET 0x0c
-#define MMIO_MISC_OFFSET 0x10
-
-/* Masks, shifts and macros to parse the device range capability */
-#define MMIO_RANGE_LD_MASK 0xff000000
-#define MMIO_RANGE_FD_MASK 0x00ff0000
-#define MMIO_RANGE_BUS_MASK 0x0000ff00
-#define MMIO_RANGE_LD_SHIFT 24
-#define MMIO_RANGE_FD_SHIFT 16
-#define MMIO_RANGE_BUS_SHIFT 8
-#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
-#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
-#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
-#define MMIO_MSI_NUM(x) ((x) & 0x1f)
-
-/* Flag masks for the AMD IOMMU exclusion range */
-#define MMIO_EXCL_ENABLE_MASK 0x01ULL
-#define MMIO_EXCL_ALLOW_MASK 0x02ULL
-
-/* Used offsets into the MMIO space */
-#define MMIO_DEV_TABLE_OFFSET 0x0000
-#define MMIO_CMD_BUF_OFFSET 0x0008
-#define MMIO_EVT_BUF_OFFSET 0x0010
-#define MMIO_CONTROL_OFFSET 0x0018
-#define MMIO_EXCL_BASE_OFFSET 0x0020
-#define MMIO_EXCL_LIMIT_OFFSET 0x0028
-#define MMIO_CMD_HEAD_OFFSET 0x2000
-#define MMIO_CMD_TAIL_OFFSET 0x2008
-#define MMIO_EVT_HEAD_OFFSET 0x2010
-#define MMIO_EVT_TAIL_OFFSET 0x2018
-#define MMIO_STATUS_OFFSET 0x2020
-
-/* MMIO status bits */
-#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
-
-/* event logging constants */
-#define EVENT_ENTRY_SIZE 0x10
-#define EVENT_TYPE_SHIFT 28
-#define EVENT_TYPE_MASK 0xf
-#define EVENT_TYPE_ILL_DEV 0x1
-#define EVENT_TYPE_IO_FAULT 0x2
-#define EVENT_TYPE_DEV_TAB_ERR 0x3
-#define EVENT_TYPE_PAGE_TAB_ERR 0x4
-#define EVENT_TYPE_ILL_CMD 0x5
-#define EVENT_TYPE_CMD_HARD_ERR 0x6
-#define EVENT_TYPE_IOTLB_INV_TO 0x7
-#define EVENT_TYPE_INV_DEV_REQ 0x8
-#define EVENT_DEVID_MASK 0xffff
-#define EVENT_DEVID_SHIFT 0
-#define EVENT_DOMID_MASK 0xffff
-#define EVENT_DOMID_SHIFT 0
-#define EVENT_FLAGS_MASK 0xfff
-#define EVENT_FLAGS_SHIFT 0x10
-
-/* feature control bits */
-#define CONTROL_IOMMU_EN 0x00ULL
-#define CONTROL_HT_TUN_EN 0x01ULL
-#define CONTROL_EVT_LOG_EN 0x02ULL
-#define CONTROL_EVT_INT_EN 0x03ULL
-#define CONTROL_COMWAIT_EN 0x04ULL
-#define CONTROL_PASSPW_EN 0x08ULL
-#define CONTROL_RESPASSPW_EN 0x09ULL
-#define CONTROL_COHERENT_EN 0x0aULL
-#define CONTROL_ISOC_EN 0x0bULL
-#define CONTROL_CMDBUF_EN 0x0cULL
-#define CONTROL_PPFLOG_EN 0x0dULL
-#define CONTROL_PPFINT_EN 0x0eULL
-
-/* command specific defines */
-#define CMD_COMPL_WAIT 0x01
-#define CMD_INV_DEV_ENTRY 0x02
-#define CMD_INV_IOMMU_PAGES 0x03
-
-#define CMD_COMPL_WAIT_STORE_MASK 0x01
-#define CMD_COMPL_WAIT_INT_MASK 0x02
-#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
-#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
-
-#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
-
-/* macros and definitions for device table entries */
-#define DEV_ENTRY_VALID 0x00
-#define DEV_ENTRY_TRANSLATION 0x01
-#define DEV_ENTRY_IR 0x3d
-#define DEV_ENTRY_IW 0x3e
-#define DEV_ENTRY_NO_PAGE_FAULT 0x62
-#define DEV_ENTRY_EX 0x67
-#define DEV_ENTRY_SYSMGT1 0x68
-#define DEV_ENTRY_SYSMGT2 0x69
-#define DEV_ENTRY_INIT_PASS 0xb8
-#define DEV_ENTRY_EINT_PASS 0xb9
-#define DEV_ENTRY_NMI_PASS 0xba
-#define DEV_ENTRY_LINT0_PASS 0xbe
-#define DEV_ENTRY_LINT1_PASS 0xbf
-#define DEV_ENTRY_MODE_MASK 0x07
-#define DEV_ENTRY_MODE_SHIFT 0x09
-
-/* constants to configure the command buffer */
-#define CMD_BUFFER_SIZE 8192
-#define CMD_BUFFER_UNINITIALIZED 1
-#define CMD_BUFFER_ENTRIES 512
-#define MMIO_CMD_SIZE_SHIFT 56
-#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
-
-/* constants for event buffer handling */
-#define EVT_BUFFER_SIZE 8192 /* 512 entries */
-#define EVT_LEN_MASK (0x9ULL << 56)
-
-#define PAGE_MODE_NONE 0x00
-#define PAGE_MODE_1_LEVEL 0x01
-#define PAGE_MODE_2_LEVEL 0x02
-#define PAGE_MODE_3_LEVEL 0x03
-#define PAGE_MODE_4_LEVEL 0x04
-#define PAGE_MODE_5_LEVEL 0x05
-#define PAGE_MODE_6_LEVEL 0x06
-
-#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x) (((x) < 6) ? \
- ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
- (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
- IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k 0
-#define PM_ADDR_MASK 0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
- (~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
- ((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
- (1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
- ((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address an a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize) \
- (((address) | ((pagesize) - 1)) & \
- (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
- (1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-#define IOMMU_PTE_P (1ULL << 0)
-#define IOMMU_PTE_TV (1ULL << 1)
-#define IOMMU_PTE_U (1ULL << 59)
-#define IOMMU_PTE_FC (1ULL << 60)
-#define IOMMU_PTE_IR (1ULL << 61)
-#define IOMMU_PTE_IW (1ULL << 62)
-
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
-#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
-#define IOMMU_PROT_MASK 0x03
-#define IOMMU_PROT_IR 0x01
-#define IOMMU_PROT_IW 0x02
-
-/* IOMMU capabilities */
-#define IOMMU_CAP_IOTLB 24
-#define IOMMU_CAP_NPCACHE 26
-
-#define MAX_DOMAIN_ID 65536
-
-/* FIXME: move this macro to <linux/pci.h> */
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
-
-/* Protection domain flags */
-#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
-#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
- domain for an IOMMU */
-#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page
- translation */
-
-extern bool amd_iommu_dump;
-#define DUMP_printk(format, arg...) \
- do { \
- if (amd_iommu_dump) \
- printk(KERN_INFO "AMD-Vi: " format, ## arg); \
- } while(0);
-
-/* global flag if IOMMUs cache non-present entries */
-extern bool amd_iommu_np_cache;
-
-/*
- * Make iterating over all IOMMUs easier
- */
-#define for_each_iommu(iommu) \
- list_for_each_entry((iommu), &amd_iommu_list, list)
-#define for_each_iommu_safe(iommu, next) \
- list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
-
-#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
-#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
-#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
-#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
-#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
-#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
-
-/*
- * This structure contains generic data for IOMMU protection domains
- * independent of their use.
- */
-struct protection_domain {
- struct list_head list; /* for list of all protection domains */
- struct list_head dev_list; /* List of all devices in this domain */
- spinlock_t lock; /* mostly used to lock the page table*/
- struct mutex api_lock; /* protect page tables in the iommu-api path */
- u16 id; /* the domain id written to the device table */
- int mode; /* paging mode (0-6 levels) */
- u64 *pt_root; /* page table root pointer */
- unsigned long flags; /* flags to find out type of domain */
- bool updated; /* complete domain flush required */
- unsigned dev_cnt; /* devices assigned to this domain */
- unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
- void *priv; /* private data */
-
-};
-
-/*
- * This struct contains device specific data for the IOMMU
- */
-struct iommu_dev_data {
- struct list_head list; /* For domain->dev_list */
- struct device *dev; /* Device this data belong to */
- struct device *alias; /* The Alias Device */
- struct protection_domain *domain; /* Domain the device is bound to */
- atomic_t bind; /* Domain attach reverent count */
-};
-
-/*
- * For dynamic growth the aperture size is split into ranges of 128MB of
- * DMA address space each. This struct represents one such range.
- */
-struct aperture_range {
-
- /* address allocation bitmap */
- unsigned long *bitmap;
-
- /*
- * Array of PTE pages for the aperture. In this array we save all the
- * leaf pages of the domain page table used for the aperture. This way
- * we don't need to walk the page table to find a specific PTE. We can
- * just calculate its address in constant time.
- */
- u64 *pte_pages[64];
-
- unsigned long offset;
-};
-
-/*
- * Data container for a dma_ops specific protection domain
- */
-struct dma_ops_domain {
- struct list_head list;
-
- /* generic protection domain information */
- struct protection_domain domain;
-
- /* size of the aperture for the mappings */
- unsigned long aperture_size;
-
- /* address we start to search for free addresses */
- unsigned long next_address;
-
- /* address space relevant data */
- struct aperture_range *aperture[APERTURE_MAX_RANGES];
-
- /* This will be set to true when TLB needs to be flushed */
- bool need_flush;
-
- /*
- * if this is a preallocated domain, keep the device for which it was
- * preallocated in this variable
- */
- u16 target_dev;
-};
-
-/*
- * Structure where we save information about one hardware AMD IOMMU in the
- * system.
- */
-struct amd_iommu {
- struct list_head list;
-
- /* Index within the IOMMU array */
- int index;
-
- /* locks the accesses to the hardware */
- spinlock_t lock;
-
- /* Pointer to PCI device of this IOMMU */
- struct pci_dev *dev;
-
- /* physical address of MMIO space */
- u64 mmio_phys;
- /* virtual address of MMIO space */
- u8 *mmio_base;
-
- /* capabilities of that IOMMU read from ACPI */
- u32 cap;
-
- /* flags read from acpi table */
- u8 acpi_flags;
-
- /*
- * Capability pointer. There could be more than one IOMMU per PCI
- * device function if there are more than one AMD IOMMU capability
- * pointers.
- */
- u16 cap_ptr;
-
- /* pci domain of this IOMMU */
- u16 pci_seg;
-
- /* first device this IOMMU handles. read from PCI */
- u16 first_device;
- /* last device this IOMMU handles. read from PCI */
- u16 last_device;
-
- /* start of exclusion range of that IOMMU */
- u64 exclusion_start;
- /* length of exclusion range of that IOMMU */
- u64 exclusion_length;
-
- /* command buffer virtual address */
- u8 *cmd_buf;
- /* size of command buffer */
- u32 cmd_buf_size;
-
- /* size of event buffer */
- u32 evt_buf_size;
- /* event buffer virtual address */
- u8 *evt_buf;
- /* MSI number for event interrupt */
- u16 evt_msi_num;
-
- /* true if interrupts for this IOMMU are already enabled */
- bool int_enabled;
-
- /* if one, we need to send a completion wait command */
- bool need_sync;
-
- /* becomes true if a command buffer reset is running */
- bool reset_in_progress;
-
- /* default dma_ops domain for that IOMMU */
- struct dma_ops_domain *default_dom;
-
- /*
- * We can't rely on the BIOS to restore all values on reinit, so we
- * need to stash them
- */
-
- /* The iommu BAR */
- u32 stored_addr_lo;
- u32 stored_addr_hi;
-
- /*
- * Each iommu has 6 l1s, each of which is documented as having 0x12
- * registers
- */
- u32 stored_l1[6][0x12];
-
- /* The l2 indirect registers */
- u32 stored_l2[0x83];
-};
-
-/*
- * List with all IOMMUs in the system. This list is not locked because it is
- * only written and read at driver initialization or suspend time
- */
-extern struct list_head amd_iommu_list;
-
-/*
- * Array with pointers to each IOMMU struct
- * The indices are referenced in the protection domains
- */
-extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
-
-/* Number of IOMMUs present in the system */
-extern int amd_iommus_present;
-
-/*
- * Declarations for the global list of all protection domains
- */
-extern spinlock_t amd_iommu_pd_lock;
-extern struct list_head amd_iommu_pd_list;
-
-/*
- * Structure defining one entry in the device table
- */
-struct dev_table_entry {
- u32 data[8];
-};
-
-/*
- * One entry for unity mappings parsed out of the ACPI table.
- */
-struct unity_map_entry {
- struct list_head list;
-
- /* starting device id this entry is used for (including) */
- u16 devid_start;
- /* end device id this entry is used for (including) */
- u16 devid_end;
-
- /* start address to unity map (including) */
- u64 address_start;
- /* end address to unity map (including) */
- u64 address_end;
-
- /* required protection */
- int prot;
-};
-
-/*
- * List of all unity mappings. It is not locked because as runtime it is only
- * read. It is created at ACPI table parsing time.
- */
-extern struct list_head amd_iommu_unity_map;
-
-/*
- * Data structures for device handling
- */
-
-/*
- * Device table used by hardware. Read and write accesses by software are
- * locked with the amd_iommu_pd_table lock.
- */
-extern struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * Alias table to find requestor ids to device ids. Not locked because only
- * read on runtime.
- */
-extern u16 *amd_iommu_alias_table;
-
-/*
- * Reverse lookup table to find the IOMMU which translates a specific device.
- */
-extern struct amd_iommu **amd_iommu_rlookup_table;
-
-/* size of the dma_ops aperture as power of 2 */
-extern unsigned amd_iommu_aperture_order;
-
-/* largest PCI device id we expect translation requests for */
-extern u16 amd_iommu_last_bdf;
-
-/* allocation bitmap for domain ids */
-extern unsigned long *amd_iommu_pd_alloc_bitmap;
-
-/*
- * If true, the addresses will be flushed on unmap time, not when
- * they are reused
- */
-extern bool amd_iommu_unmap_flush;
-
-/* takes bus and device/function and returns the device id
- * FIXME: should that be in generic PCI code? */
-static inline u16 calc_devid(u8 bus, u8 devfn)
-{
- return (((u16)bus) << 8) | devfn;
-}
-
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-struct __iommu_counter {
- char *name;
- struct dentry *dent;
- u64 value;
-};
-
-#define DECLARE_STATS_COUNTER(nm) \
- static struct __iommu_counter nm = { \
- .name = #nm, \
- }
-
-#define INC_STATS_COUNTER(name) name.value += 1
-#define ADD_STATS_COUNTER(name, x) name.value += (x)
-#define SUB_STATS_COUNTER(name, x) name.value -= (x)
-
-#else /* CONFIG_AMD_IOMMU_STATS */
-
-#define DECLARE_STATS_COUNTER(name)
-#define INC_STATS_COUNTER(name)
-#define ADD_STATS_COUNTER(name, x)
-#define SUB_STATS_COUNTER(name, x)
-
-#endif /* CONFIG_AMD_IOMMU_STATS */
-
-#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 331682231bb..67f87f25761 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range {
extern const struct pci_device_id amd_nb_misc_ids[];
extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
-struct bootnode;
extern bool early_is_amd_nb(u32 value);
extern int amd_cache_northbridges(void);
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index 2fefa501d3b..0acbac299e4 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -18,24 +18,6 @@
#ifdef CONFIG_APB_TIMER
-/* Langwell DW APB timer registers */
-#define APBTMR_N_LOAD_COUNT 0x00
-#define APBTMR_N_CURRENT_VALUE 0x04
-#define APBTMR_N_CONTROL 0x08
-#define APBTMR_N_EOI 0x0c
-#define APBTMR_N_INT_STATUS 0x10
-
-#define APBTMRS_INT_STATUS 0xa0
-#define APBTMRS_EOI 0xa4
-#define APBTMRS_RAW_INT_STATUS 0xa8
-#define APBTMRS_COMP_VERSION 0xac
-#define APBTMRS_REG_SIZE 0x14
-
-/* register bits */
-#define APBTMR_CONTROL_ENABLE (1<<0)
-#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */
-#define APBTMR_CONTROL_INT (1<<2)
-
/* default memory mapped register base */
#define LNW_SCU_ADDR 0xFF100000
#define LNW_EXT_TIMER_OFFSET 0x1B800
@@ -43,14 +25,13 @@
#define LNW_EXT_TIMER_PGOFFSET 0x800
/* APBT clock speed range from PCLK to fabric base, 25-100MHz */
-#define APBT_MAX_FREQ 50
-#define APBT_MIN_FREQ 1
+#define APBT_MAX_FREQ 50000000
+#define APBT_MIN_FREQ 1000000
#define APBT_MMAP_SIZE 1024
#define APBT_DEV_USED 1
extern void apbt_time_init(void);
-extern struct clock_event_device *global_clock_event;
extern unsigned long apbt_quick_calibrate(void);
extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
extern void apbt_setup_secondary_clock(void);
@@ -62,7 +43,7 @@ extern int sfi_mtimer_num;
#else /* CONFIG_APB_TIMER */
static inline unsigned long apbt_quick_calibrate(void) {return 0; }
-static inline void apbt_time_init(void) {return 0; }
+static inline void apbt_time_init(void) { }
#endif
#endif /* ASM_X86_APBT_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 2b7d573be54..7b3ca8324b6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -8,7 +8,7 @@
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/system.h>
@@ -363,7 +363,12 @@ struct apic {
*/
int (*x86_32_early_logical_apicid)(int cpu);
- /* determine CPU -> NUMA node mapping */
+ /*
+ * Optional method called from setup_local_APIC() after logical
+ * apicid is guaranteed to be known to initialize apicid -> node
+ * mapping if NUMA initialization hasn't done so already. Don't
+ * add new users.
+ */
int (*x86_32_numa_cpu_node)(int cpu);
#endif
};
@@ -376,6 +381,26 @@ struct apic {
extern struct apic *apic;
/*
+ * APIC drivers are probed based on how they are listed in the .apicdrivers
+ * section. So the order is important and enforced by the ordering
+ * of different apic driver files in the Makefile.
+ *
+ * For the files having two apic drivers, we use apic_drivers()
+ * to enforce the order with in them.
+ */
+#define apic_driver(sym) \
+ static struct apic *__apicdrivers_##sym __used \
+ __aligned(sizeof(struct apic *)) \
+ __section(.apicdrivers) = { &sym }
+
+#define apic_drivers(sym1, sym2) \
+ static struct apic *__apicdrivers_##sym1##sym2[2] __used \
+ __aligned(sizeof(struct apic *)) \
+ __section(.apicdrivers) = { &sym1, &sym2 }
+
+extern struct apic *__apicdrivers[], *__apicdrivers_end[];
+
+/*
* APIC functionality to boot other CPUs - only used on SMP:
*/
#ifdef CONFIG_SMP
@@ -453,15 +478,10 @@ static inline unsigned default_get_apic_id(unsigned long x)
#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
#ifdef CONFIG_X86_64
-extern struct apic apic_flat;
-extern struct apic apic_physflat;
-extern struct apic apic_x2apic_cluster;
-extern struct apic apic_x2apic_phys;
extern int default_acpi_madt_oem_check(char *, char *);
extern void apic_send_IPI_self(int vector);
-extern struct apic apic_x2apic_uv_x;
DECLARE_PER_CPU(int, x2apic_extra_bits);
extern int default_cpu_present_to_apicid(int mps_cpu);
@@ -475,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert)
return;
}
-extern void generic_bigsmp_probe(void);
+extern struct apic *generic_bigsmp_probe(void);
#ifdef CONFIG_X86_LOCAL_APIC
@@ -511,8 +531,6 @@ extern struct apic apic_noop;
#ifdef CONFIG_X86_32
-extern struct apic apic_default;
-
static inline int noop_x86_32_early_logical_apicid(int cpu)
{
return BAD_APICID;
@@ -537,8 +555,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
return cpuid_apic >> index_msb;
}
-extern int default_x86_32_numa_cpu_node(int cpu);
-
#endif
static inline unsigned int
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index d87988bacf3..34595d5e103 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -78,6 +78,7 @@
#define APIC_DEST_LOGICAL 0x00800
#define APIC_DEST_PHYSICAL 0x00000
#define APIC_DM_FIXED 0x00000
+#define APIC_DM_FIXED_MASK 0x00700
#define APIC_DM_LOWEST 0x00100
#define APIC_DM_SMI 0x00200
#define APIC_DM_REMRD 0x00300
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index b3ed1e1460f..9412d6558c8 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,9 +3,11 @@
#ifdef __ASSEMBLY__
# define __ASM_FORM(x) x
+# define __ASM_FORM_COMMA(x) x,
# define __ASM_EX_SEC .section __ex_table, "a"
#else
# define __ASM_FORM(x) " " #x " "
+# define __ASM_FORM_COMMA(x) " " #x ","
# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
#endif
@@ -15,7 +17,8 @@
# define __ASM_SEL(a,b) __ASM_FORM(b)
#endif
-#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q)
+#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
+ inst##q##__VA_ARGS__)
#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
#define _ASM_PTR __ASM_SEL(.long, .quad)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 952a826ac4e..10572e309ab 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -221,15 +221,15 @@ static inline int atomic_xchg(atomic_t *v, int new)
}
/**
- * atomic_add_unless - add unless the number is already a given value
+ * __atomic_add_unless - add unless the number is already a given value
* @v: pointer of type atomic_t
* @a: the amount to add to v...
* @u: ...unless v is equal to u.
*
* Atomically adds @a to @v, so long as @v was not already @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
+ * Returns the old value of @v.
*/
-static inline int atomic_add_unless(atomic_t *v, int a, int u)
+static inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
int c, old;
c = atomic_read(v);
@@ -241,10 +241,9 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
break;
c = old;
}
- return c != (u);
+ return c;
}
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
/*
* atomic_dec_if_positive - decrement by 1 if old value positive
@@ -319,5 +318,4 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
# include "atomic64_64.h"
#endif
-#include <asm-generic/atomic-long.h>
#endif /* _ASM_X86_ATOMIC_H */
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 2a934aa19a4..24098aafce0 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -263,7 +263,7 @@ static inline int atomic64_add_negative(long long i, atomic64_t *v)
* @u: ...unless v is equal to u.
*
* Atomically adds @a to @v, so long as it was not @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
+ * Returns the old value of @v.
*/
static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
{
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 49fd1ea2295..017594d403f 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -202,7 +202,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
* @u: ...unless v is equal to u.
*
* Atomically adds @a to @v, so long as it was not @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
+ * Returns the old value of @v.
*/
static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
{
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
index 3c7521063d3..aa6a3170ab5 100644
--- a/arch/x86/include/asm/bios_ebda.h
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -4,16 +4,40 @@
#include <asm/io.h>
/*
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E.
+ * Returns physical address of EBDA. Returns 0 if there is no EBDA.
*/
static inline unsigned int get_bios_ebda(void)
{
+ /*
+ * There is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E.
+ */
unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
address <<= 4;
return address; /* 0 means none */
}
+/*
+ * Return the sanitized length of the EBDA in bytes, if it exists.
+ */
+static inline unsigned int get_bios_ebda_length(void)
+{
+ unsigned int address;
+ unsigned int length;
+
+ address = get_bios_ebda();
+ if (!address)
+ return 0;
+
+ /* EBDA length is byte 0 of the EBDA (stored in KiB) */
+ length = *(unsigned char *)phys_to_virt(address);
+ length <<= 10;
+
+ /* Trim the length if it extends beyond 640KiB */
+ length = min_t(unsigned int, (640 * 1024) - address, length);
+ return length;
+}
+
void reserve_ebda_region(void);
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 69d58131bc8..1775d6e5920 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -458,10 +458,7 @@ static inline int fls(int x)
#include <asm-generic/bitops/le.h>
-#define ext2_set_bit_atomic(lock, nr, addr) \
- test_and_set_bit((nr), (unsigned long *)(addr))
-#define ext2_clear_bit_atomic(lock, nr, addr) \
- test_and_clear_bit((nr), (unsigned long *)(addr))
+#include <asm-generic/bitops/ext2-atomic-setbit.h>
#endif /* __KERNEL__ */
#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 30af5a83216..a9e3a740f69 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -46,6 +46,7 @@ For 32-bit we have the following conventions - kernel is built with
*/
+#include "dwarf2.h"
/*
* 64-bit system call stack frame layout defines and helpers, for
@@ -84,72 +85,57 @@ For 32-bit we have the following conventions - kernel is built with
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX
- .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
+ .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
subq $9*8+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET 9*8+\addskip
- movq %rdi, 8*8(%rsp)
- CFI_REL_OFFSET rdi, 8*8
- movq %rsi, 7*8(%rsp)
- CFI_REL_OFFSET rsi, 7*8
- movq %rdx, 6*8(%rsp)
- CFI_REL_OFFSET rdx, 6*8
- .if \norcx
- .else
- movq %rcx, 5*8(%rsp)
- CFI_REL_OFFSET rcx, 5*8
+ movq_cfi rdi, 8*8
+ movq_cfi rsi, 7*8
+ movq_cfi rdx, 6*8
+
+ .if \save_rcx
+ movq_cfi rcx, 5*8
.endif
- movq %rax, 4*8(%rsp)
- CFI_REL_OFFSET rax, 4*8
- .if \nor891011
- .else
- movq %r8, 3*8(%rsp)
- CFI_REL_OFFSET r8, 3*8
- movq %r9, 2*8(%rsp)
- CFI_REL_OFFSET r9, 2*8
- movq %r10, 1*8(%rsp)
- CFI_REL_OFFSET r10, 1*8
- movq %r11, (%rsp)
- CFI_REL_OFFSET r11, 0*8
+
+ movq_cfi rax, 4*8
+
+ .if \save_r891011
+ movq_cfi r8, 3*8
+ movq_cfi r9, 2*8
+ movq_cfi r10, 1*8
+ movq_cfi r11, 0*8
.endif
+
.endm
#define ARG_SKIP (9*8)
- .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
- skipr8910=0, skiprdx=0
- .if \skipr11
- .else
- movq (%rsp), %r11
- CFI_RESTORE r11
+ .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
+ rstor_r8910=1, rstor_rdx=1
+ .if \rstor_r11
+ movq_cfi_restore 0*8, r11
.endif
- .if \skipr8910
- .else
- movq 1*8(%rsp), %r10
- CFI_RESTORE r10
- movq 2*8(%rsp), %r9
- CFI_RESTORE r9
- movq 3*8(%rsp), %r8
- CFI_RESTORE r8
+
+ .if \rstor_r8910
+ movq_cfi_restore 1*8, r10
+ movq_cfi_restore 2*8, r9
+ movq_cfi_restore 3*8, r8
.endif
- .if \skiprax
- .else
- movq 4*8(%rsp), %rax
- CFI_RESTORE rax
+
+ .if \rstor_rax
+ movq_cfi_restore 4*8, rax
.endif
- .if \skiprcx
- .else
- movq 5*8(%rsp), %rcx
- CFI_RESTORE rcx
+
+ .if \rstor_rcx
+ movq_cfi_restore 5*8, rcx
.endif
- .if \skiprdx
- .else
- movq 6*8(%rsp), %rdx
- CFI_RESTORE rdx
+
+ .if \rstor_rdx
+ movq_cfi_restore 6*8, rdx
.endif
- movq 7*8(%rsp), %rsi
- CFI_RESTORE rsi
- movq 8*8(%rsp), %rdi
- CFI_RESTORE rdi
+
+ movq_cfi_restore 7*8, rsi
+ movq_cfi_restore 8*8, rdi
+
.if ARG_SKIP+\addskip > 0
addq $ARG_SKIP+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
@@ -176,33 +162,21 @@ For 32-bit we have the following conventions - kernel is built with
.macro SAVE_REST
subq $REST_SKIP, %rsp
CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq %rbx, 5*8(%rsp)
- CFI_REL_OFFSET rbx, 5*8
- movq %rbp, 4*8(%rsp)
- CFI_REL_OFFSET rbp, 4*8
- movq %r12, 3*8(%rsp)
- CFI_REL_OFFSET r12, 3*8
- movq %r13, 2*8(%rsp)
- CFI_REL_OFFSET r13, 2*8
- movq %r14, 1*8(%rsp)
- CFI_REL_OFFSET r14, 1*8
- movq %r15, (%rsp)
- CFI_REL_OFFSET r15, 0*8
+ movq_cfi rbx, 5*8
+ movq_cfi rbp, 4*8
+ movq_cfi r12, 3*8
+ movq_cfi r13, 2*8
+ movq_cfi r14, 1*8
+ movq_cfi r15, 0*8
.endm
.macro RESTORE_REST
- movq (%rsp), %r15
- CFI_RESTORE r15
- movq 1*8(%rsp), %r14
- CFI_RESTORE r14
- movq 2*8(%rsp), %r13
- CFI_RESTORE r13
- movq 3*8(%rsp), %r12
- CFI_RESTORE r12
- movq 4*8(%rsp), %rbp
- CFI_RESTORE rbp
- movq 5*8(%rsp), %rbx
- CFI_RESTORE rbx
+ movq_cfi_restore 0*8, r15
+ movq_cfi_restore 1*8, r14
+ movq_cfi_restore 2*8, r13
+ movq_cfi_restore 3*8, r12
+ movq_cfi_restore 4*8, rbp
+ movq_cfi_restore 5*8, rbx
addq $REST_SKIP, %rsp
CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
.endm
@@ -214,7 +188,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro RESTORE_ALL addskip=0
RESTORE_REST
- RESTORE_ARGS 0, \addskip
+ RESTORE_ARGS 1, \addskip
.endm
.macro icebp
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
new file mode 100644
index 00000000000..0bdbbb3b9ce
--- /dev/null
+++ b/arch/x86/include/asm/clocksource.h
@@ -0,0 +1,18 @@
+/* x86-specific clocksource additions */
+
+#ifndef _ASM_X86_CLOCKSOURCE_H
+#define _ASM_X86_CLOCKSOURCE_H
+
+#ifdef CONFIG_X86_64
+
+#define VCLOCK_NONE 0 /* No vDSO clock available. */
+#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
+#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */
+
+struct arch_clocksource_data {
+ int vclock_mode;
+};
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 284a6e8f7ce..3deb7250624 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -280,4 +280,52 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
#endif
+#define cmpxchg8b(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __dummy; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \
+ : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\
+ : "a" (__old1), "d"(__old2), \
+ "b" (__new1), "c" (__new2) \
+ : "memory"); \
+ __ret; })
+
+
+#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __dummy; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile("cmpxchg8b %2; setz %1" \
+ : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\
+ : "a" (__old), "d"(__old2), \
+ "b" (__new1), "c" (__new2), \
+ : "memory"); \
+ __ret; })
+
+
+#define cmpxchg_double(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
+ VM_BUG_ON((unsigned long)(ptr) % 8); \
+ cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
+ VM_BUG_ON((unsigned long)(ptr) % 8); \
+ cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define system_has_cmpxchg_double() cpu_has_cx8
+
#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 423ae58aa02..7cf5c0a2443 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -151,4 +151,49 @@ extern void __cmpxchg_wrong_size(void);
cmpxchg_local((ptr), (o), (n)); \
})
+#define cmpxchg16b(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __junk; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \
+ : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
+ : "b"(__new1), "c"(__new2), \
+ "a"(__old1), "d"(__old2)); \
+ __ret; })
+
+
+#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ __typeof__(o2) __junk; \
+ __typeof__(*(ptr)) __old1 = (o1); \
+ __typeof__(o2) __old2 = (o2); \
+ __typeof__(*(ptr)) __new1 = (n1); \
+ __typeof__(o2) __new2 = (n2); \
+ asm volatile("cmpxchg16b %2;setz %1" \
+ : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
+ : "b"(__new1), "c"(__new2), \
+ "a"(__old1), "d"(__old2)); \
+ __ret; })
+
+#define cmpxchg_double(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ VM_BUG_ON((unsigned long)(ptr) % 16); \
+ cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ VM_BUG_ON((unsigned long)(ptr) % 16); \
+ cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
+})
+
+#define system_has_cmpxchg_double() cpu_has_cx16
+
#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 91f3e087cf2..4258aac99a6 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -125,7 +125,7 @@
#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */
+#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */
#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -195,6 +195,8 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
@@ -207,8 +209,7 @@ extern const char * const x86_power_flags[32];
#define test_cpu_cap(c, bit) \
test_bit(bit, (unsigned long *)((c)->x86_capability))
-#define cpu_has(c, bit) \
- (__builtin_constant_p(bit) && \
+#define REQUIRED_MASK_BIT_SET(bit) \
( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
(((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
(((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
@@ -218,10 +219,16 @@ extern const char * const x86_power_flags[32];
(((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
(((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
(((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
- (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \
- ? 1 : \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+
+#define cpu_has(c, bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
test_cpu_cap(c, bit))
+#define this_cpu_has(bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
+ x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
@@ -281,6 +288,8 @@ extern const char * const x86_power_flags[32];
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8)
+#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
# define cpu_has_invlpg 1
@@ -324,8 +333,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
"2:\n"
".section .altinstructions,\"a\"\n"
_ASM_ALIGN "\n"
- _ASM_PTR "1b\n"
- _ASM_PTR "0\n" /* no replacement */
+ " .long 1b - .\n"
+ " .long 0\n" /* no replacement */
" .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
@@ -342,8 +351,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
"2:\n"
".section .altinstructions,\"a\"\n"
_ASM_ALIGN "\n"
- _ASM_PTR "1b\n"
- _ASM_PTR "3f\n"
+ " .long 1b - .\n"
+ " .long 3f - .\n"
" .word %P1\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 409a649204a..9b3b4f2754c 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -1,30 +1,7 @@
#ifndef _ASM_X86_DELAY_H
#define _ASM_X86_DELAY_H
-/*
- * Copyright (C) 1993 Linus Torvalds
- *
- * Delay routines calling functions in arch/x86/lib/delay.c
- */
-
-/* Undefined functions to get compile-time errors */
-extern void __bad_udelay(void);
-extern void __bad_ndelay(void);
-
-extern void __udelay(unsigned long usecs);
-extern void __ndelay(unsigned long nsecs);
-extern void __const_udelay(unsigned long xloops);
-extern void __delay(unsigned long loops);
-
-/* 0x10c7 is 2**32 / 1000000 (rounded up) */
-#define udelay(n) (__builtin_constant_p(n) ? \
- ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
- __udelay(n))
-
-/* 0x5 is 2**32 / 1000000000 (rounded up) */
-#define ndelay(n) (__builtin_constant_p(n) ? \
- ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
- __ndelay(n))
+#include <asm-generic/delay.h>
void use_tsc_delay(void);
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 617bd56b307..7b439d9aea2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,30 +4,33 @@
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
+
#include <linux/smp.h>
-static inline void fill_ldt(struct desc_struct *desc,
- const struct user_desc *info)
-{
- desc->limit0 = info->limit & 0x0ffff;
- desc->base0 = info->base_addr & 0x0000ffff;
-
- desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
- desc->type = (info->read_exec_only ^ 1) << 1;
- desc->type |= info->contents << 2;
- desc->s = 1;
- desc->dpl = 0x3;
- desc->p = info->seg_not_present ^ 1;
- desc->limit = (info->limit & 0xf0000) >> 16;
- desc->avl = info->useable;
- desc->d = info->seg_32bit;
- desc->g = info->limit_in_pages;
- desc->base2 = (info->base_addr & 0xff000000) >> 24;
+static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
+{
+ desc->limit0 = info->limit & 0x0ffff;
+
+ desc->base0 = (info->base_addr & 0x0000ffff);
+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+
+ desc->type = (info->read_exec_only ^ 1) << 1;
+ desc->type |= info->contents << 2;
+
+ desc->s = 1;
+ desc->dpl = 0x3;
+ desc->p = info->seg_not_present ^ 1;
+ desc->limit = (info->limit & 0xf0000) >> 16;
+ desc->avl = info->useable;
+ desc->d = info->seg_32bit;
+ desc->g = info->limit_in_pages;
+
+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
/*
* Don't allow setting of the lm bit. It is useless anyway
* because 64bit system calls require __USER_CS:
*/
- desc->l = 0;
+ desc->l = 0;
}
extern struct desc_ptr idt_descr;
@@ -36,6 +39,7 @@ extern gate_desc idt_table[];
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
+
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
@@ -48,16 +52,16 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
unsigned dpl, unsigned ist, unsigned seg)
{
- gate->offset_low = PTR_LOW(func);
- gate->segment = __KERNEL_CS;
- gate->ist = ist;
- gate->p = 1;
- gate->dpl = dpl;
- gate->zero0 = 0;
- gate->zero1 = 0;
- gate->type = type;
- gate->offset_middle = PTR_MIDDLE(func);
- gate->offset_high = PTR_HIGH(func);
+ gate->offset_low = PTR_LOW(func);
+ gate->segment = __KERNEL_CS;
+ gate->ist = ist;
+ gate->p = 1;
+ gate->dpl = dpl;
+ gate->zero0 = 0;
+ gate->zero1 = 0;
+ gate->type = type;
+ gate->offset_middle = PTR_MIDDLE(func);
+ gate->offset_high = PTR_HIGH(func);
}
#else
@@ -66,8 +70,7 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
unsigned short seg)
{
gate->a = (seg << 16) | (base & 0xffff);
- gate->b = (base & 0xffff0000) |
- (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+ gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
}
#endif
@@ -75,31 +78,29 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
static inline int desc_empty(const void *ptr)
{
const u32 *desc = ptr;
+
return !(desc[0] | desc[1]);
}
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
-#define load_TR_desc() native_load_tr_desc()
-#define load_gdt(dtr) native_load_gdt(dtr)
-#define load_idt(dtr) native_load_idt(dtr)
-#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
-#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
-
-#define store_gdt(dtr) native_store_gdt(dtr)
-#define store_idt(dtr) native_store_idt(dtr)
-#define store_tr(tr) (tr = native_store_tr())
-
-#define load_TLS(t, cpu) native_load_tls(t, cpu)
-#define set_ldt native_set_ldt
-
-#define write_ldt_entry(dt, entry, desc) \
- native_write_ldt_entry(dt, entry, desc)
-#define write_gdt_entry(dt, entry, desc, type) \
- native_write_gdt_entry(dt, entry, desc, type)
-#define write_idt_entry(dt, entry, g) \
- native_write_idt_entry(dt, entry, g)
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_idt(dtr) native_store_idt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
@@ -112,33 +113,27 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
-static inline void native_write_idt_entry(gate_desc *idt, int entry,
- const gate_desc *gate)
+static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
{
memcpy(&idt[entry], gate, sizeof(*gate));
}
-static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
- const void *desc)
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
{
memcpy(&ldt[entry], desc, 8);
}
-static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
- const void *desc, int type)
+static inline void
+native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
{
unsigned int size;
+
switch (type) {
- case DESC_TSS:
- size = sizeof(tss_desc);
- break;
- case DESC_LDT:
- size = sizeof(ldt_desc);
- break;
- default:
- size = sizeof(struct desc_struct);
- break;
+ case DESC_TSS: size = sizeof(tss_desc); break;
+ case DESC_LDT: size = sizeof(ldt_desc); break;
+ default: size = sizeof(*gdt); break;
}
+
memcpy(&gdt[entry], desc, size);
}
@@ -154,20 +149,21 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
}
-static inline void set_tssldt_descriptor(void *d, unsigned long addr,
- unsigned type, unsigned size)
+static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
{
#ifdef CONFIG_X86_64
struct ldttss_desc64 *desc = d;
+
memset(desc, 0, sizeof(*desc));
- desc->limit0 = size & 0xFFFF;
- desc->base0 = PTR_LOW(addr);
- desc->base1 = PTR_MIDDLE(addr) & 0xFF;
- desc->type = type;
- desc->p = 1;
- desc->limit1 = (size >> 16) & 0xF;
- desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
- desc->base3 = PTR_HIGH(addr);
+
+ desc->limit0 = size & 0xFFFF;
+ desc->base0 = PTR_LOW(addr);
+ desc->base1 = PTR_MIDDLE(addr) & 0xFF;
+ desc->type = type;
+ desc->p = 1;
+ desc->limit1 = (size >> 16) & 0xF;
+ desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+ desc->base3 = PTR_HIGH(addr);
#else
pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
#endif
@@ -237,14 +233,16 @@ static inline void native_store_idt(struct desc_ptr *dtr)
static inline unsigned long native_store_tr(void)
{
unsigned long tr;
+
asm volatile("str %0":"=r" (tr));
+
return tr;
}
static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
- unsigned int i;
struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ unsigned int i;
for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
@@ -313,6 +311,7 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
unsigned dpl, unsigned ist, unsigned seg)
{
gate_desc s;
+
pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
/*
* does not need to be atomic because it is only done once at
@@ -343,8 +342,9 @@ static inline void alloc_system_vector(int vector)
set_bit(vector, used_vectors);
if (first_system_vector > vector)
first_system_vector = vector;
- } else
+ } else {
BUG();
+ }
}
static inline void alloc_intr_gate(unsigned int n, void *addr)
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 057099e5fab..0bdb0c54d9a 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -69,22 +69,18 @@
#define MAX_DMA_CHANNELS 8
-#ifdef CONFIG_X86_32
-
-/* The maximum address that we can perform a DMA transfer to on this platform */
-#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
-
-#else
-
/* 16MB ISA DMA zone */
#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
/* 4GB broken PCI/AGP hardware bus master zone */
#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
+#ifdef CONFIG_X86_32
+/* The maximum address that we can perform a DMA transfer to on this platform */
+#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
+#else
/* Compat define for old dma zone */
#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
-
#endif
/* 8237 DMA controllers */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8e4a16508d4..7093e4a6a0b 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
#endif /* CONFIG_X86_32 */
extern int add_efi_memmap;
+extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
extern void efi_memblock_x86_reserve_range(void);
extern void efi_call_phys_prelog(void);
extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 1cd6d26a0a8..0baa628e330 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -53,8 +53,4 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
#endif
-#ifdef CONFIG_X86_MCE
-BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
-#endif
-
#endif
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4729b2b6311..460c74e4852 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,6 +78,7 @@ enum fixed_addresses {
VSYSCALL_LAST_PAGE,
VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+ VVAR_PAGE,
VSYSCALL_HPET,
#endif
FIX_DBGP_BASE,
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 2c6fc9e6281..3b629f47eb6 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,5 +1,6 @@
#ifdef __ASSEMBLY__
+#include <asm/asm.h>
#include <asm/dwarf2.h>
/* The annotation hides the frame from the unwinder and makes it look
@@ -7,13 +8,13 @@
frame pointer later */
#ifdef CONFIG_FRAME_POINTER
.macro FRAME
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp,0
- movl %esp,%ebp
+ __ASM_SIZE(push,_cfi) %__ASM_REG(bp)
+ CFI_REL_OFFSET __ASM_REG(bp), 0
+ __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp)
.endm
.macro ENDFRAME
- popl_cfi %ebp
- CFI_RESTORE ebp
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(bp)
+ CFI_RESTORE __ASM_REG(bp)
.endm
#else
.macro FRAME
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index db24c2278be..268c783ab1c 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -38,11 +38,10 @@ extern void mcount(void);
static inline unsigned long ftrace_call_adjust(unsigned long addr)
{
/*
- * call mcount is "e8 <4 byte offset>"
- * The addr points to the 4 byte offset and the caller of this
- * function wants the pointer to e8. Simply subtract one.
+ * addr is the address of the mcount call instruction.
+ * recordmcount does the necessary offset calculation.
*/
- return addr - 1;
+ return addr;
}
#ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 43085bfc99c..156cd5d18d2 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -66,7 +66,7 @@ static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
* Don't enable translation but enable GART IO and CPU accesses.
* Also, set DISTLBWALKPRB since GART tables memory is UC.
*/
- ctl = DISTLBWALKPRB | order << 1;
+ ctl = order << 1;
pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
}
@@ -75,17 +75,17 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
{
u32 tmp, ctl;
- /* address of the mappings table */
- addr >>= 12;
- tmp = (u32) addr<<4;
- tmp &= ~0xf;
- pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
-
- /* Enable GART translation for this hammer. */
- pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
- ctl |= GARTEN;
- ctl &= ~(DISGARTCPU | DISGARTIO);
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+ /* address of the mappings table */
+ addr >>= 12;
+ tmp = (u32) addr<<4;
+ tmp &= ~0xf;
+ pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
+
+ /* Enable GART translation for this hammer. */
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+ ctl |= GARTEN | DISTLBWALKPRB;
+ ctl &= ~(DISGARTCPU | DISGARTIO);
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
}
static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index bb9efe8706e..09199052060 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -21,7 +21,7 @@
#include <linux/profile.h>
#include <linux/smp.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/irq.h>
#include <asm/sections.h>
@@ -34,7 +34,6 @@ extern void irq_work_interrupt(void);
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
-extern void mce_self_interrupt(void);
extern void invalidate_interrupt(void);
extern void invalidate_interrupt0(void);
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
deleted file mode 100644
index fc1f579fb96..00000000000
--- a/arch/x86/include/asm/i8253.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _ASM_X86_I8253_H
-#define _ASM_X86_I8253_H
-
-/* i8253A PIT registers */
-#define PIT_MODE 0x43
-#define PIT_CH0 0x40
-#define PIT_CH2 0x42
-
-extern raw_spinlock_t i8253_lock;
-
-extern struct clock_event_device *global_clock_event;
-
-extern void setup_pit_timer(void);
-
-#define inb_pit inb_p
-#define outb_pit outb_p
-
-#endif /* _ASM_X86_I8253_H */
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index 38d87379e27..f49253d7571 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -16,6 +16,6 @@ static inline void enter_idle(void) { }
static inline void exit_idle(void) { }
#endif /* CONFIG_X86_64 */
-void c1e_remove_cpu(int cpu);
+void amd_e400_remove_cpu(int cpu);
#endif /* _ASM_X86_IDLE_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 07227308252..d02804d650c 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -38,7 +38,6 @@
#include <linux/string.h>
#include <linux/compiler.h>
-#include <asm-generic/int-ll64.h>
#include <asm/page.h>
#include <xen/xen.h>
@@ -87,27 +86,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
-#else
-
-static inline __u64 readq(const volatile void __iomem *addr)
-{
- const volatile u32 __iomem *p = addr;
- u32 low, high;
-
- low = readl(p);
- high = readl(p + 1);
-
- return low + ((u64)high << 32);
-}
-
-static inline void writeq(__u64 val, volatile void __iomem *addr)
-{
- writel(val, addr);
- writel(val >> 32, addr+4);
-}
-
-#endif
-
#define readq_relaxed(a) readq(a)
#define __raw_readq(a) readq(a)
@@ -117,6 +95,8 @@ static inline void writeq(__u64 val, volatile void __iomem *addr)
#define readq readq
#define writeq writeq
+#endif
+
/**
* virt_to_phys - map virtual addresses to physical
* @address: address to remap
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index c4bd267dfc5..690d1cc9a87 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -105,12 +105,12 @@ struct IR_IO_APIC_route_entry {
* # of IO-APICs and # of IRQ routing registers
*/
extern int nr_ioapics;
-extern int nr_ioapic_registers[MAX_IO_APICS];
-#define MP_MAX_IOAPIC_PIN 127
+extern int mpc_ioapic_id(int ioapic);
+extern unsigned int mpc_ioapic_addr(int ioapic);
+extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic);
-/* I/O APIC entries */
-extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
+#define MP_MAX_IOAPIC_PIN 127
/* # of MP IRQ source entries */
extern int mp_irq_entries;
@@ -150,13 +150,11 @@ void setup_IO_APIC_irq_extra(u32 gsi);
extern void ioapic_and_gsi_init(void);
extern void ioapic_insert_resources(void);
-int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr);
+int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
-extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
-extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
-extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
+extern int save_ioapic_entries(void);
+extern void mask_ioapic_entries(void);
+extern int restore_ioapic_entries(void);
extern int get_nr_irqs_gsi(void);
@@ -192,19 +190,13 @@ struct io_apic_irq_attr;
static inline int io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr) { return 0; }
-static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void)
-{
- return NULL;
-}
-
-static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { }
-static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent)
+static inline int save_ioapic_entries(void)
{
return -ENOMEM;
}
-static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { }
-static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent)
+static inline void mask_ioapic_entries(void) { }
+static inline int restore_ioapic_entries(void)
{
return -ENOMEM;
}
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6e976ee3b3e..f9a320984a1 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -17,7 +17,8 @@
* Vectors 0 ... 31 : system traps and exceptions - hardcoded events
* Vectors 32 ... 127 : device interrupts
* Vector 128 : legacy int80 syscall interface
- * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
+ * Vector 204 : legacy x86_64 vsyscall emulation
+ * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts
* Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
*
* 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
@@ -50,6 +51,9 @@
#ifdef CONFIG_X86_32
# define SYSCALL_VECTOR 0x80
#endif
+#ifdef CONFIG_X86_64
+# define VSYSCALL_EMU_VECTOR 0xcc
+#endif
/*
* Vectors 0x30-0x3f are used for ISA interrupts.
@@ -109,11 +113,6 @@
#define UV_BAU_MESSAGE 0xf5
-/*
- * Self IPI vector for machine checks
- */
-#define MCE_SELF_VECTOR 0xf4
-
/* Xen vector callback to receive events in a HVM domain */
#define XEN_HVM_EVTCHN_CALLBACK 0xf3
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 5745ce8bf10..bba3cf88e62 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -60,23 +60,24 @@ static inline void native_halt(void)
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
+#include <linux/types.h>
-static inline unsigned long arch_local_save_flags(void)
+static inline notrace unsigned long arch_local_save_flags(void)
{
return native_save_fl();
}
-static inline void arch_local_irq_restore(unsigned long flags)
+static inline notrace void arch_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}
-static inline void arch_local_irq_disable(void)
+static inline notrace void arch_local_irq_disable(void)
{
native_irq_disable();
}
-static inline void arch_local_irq_enable(void)
+static inline notrace void arch_local_irq_enable(void)
{
native_irq_enable();
}
@@ -102,7 +103,7 @@ static inline void halt(void)
/*
* For spinlocks, etc:
*/
-static inline unsigned long arch_local_irq_save(void)
+static inline notrace unsigned long arch_local_irq_save(void)
{
unsigned long flags = arch_local_save_flags();
arch_local_irq_disable();
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 574dbc22893..a32b18ce6ea 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -5,20 +5,25 @@
#include <linux/types.h>
#include <asm/nops.h>
+#include <asm/asm.h>
#define JUMP_LABEL_NOP_SIZE 5
-# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
-
-# define JUMP_LABEL(key, label) \
- do { \
- asm goto("1:" \
- JUMP_LABEL_INITIAL_NOP \
- ".pushsection __jump_table, \"aw\" \n\t"\
- _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
- ".popsection \n\t" \
- : : "i" (key) : : label); \
- } while (0)
+#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+
+static __always_inline bool arch_static_branch(struct jump_label_key *key)
+{
+ asm goto("1:"
+ JUMP_LABEL_INITIAL_NOP
+ ".pushsection __jump_table, \"aw\" \n\t"
+ _ASM_ALIGN "\n\t"
+ _ASM_PTR "1b, %l[l_yes], %c0 \n\t"
+ ".popsection \n\t"
+ : : "i" (key) : : l_yes);
+ return false;
+l_yes:
+ return true;
+}
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index fe2cc6e105f..d73f1571bde 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -28,7 +28,6 @@ extern void show_registers(struct pt_regs *regs);
extern void show_trace(struct task_struct *t, struct pt_regs *regs,
unsigned long *sp, unsigned long bp);
extern void __show_regs(struct pt_regs *regs, int all);
-extern void show_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
#ifdef CONFIG_KEXEC
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 396f5b5fc4d..77e95f54570 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -77,6 +77,7 @@ static inline void arch_kgdb_breakpoint(void)
}
#define BREAK_INSTR_SIZE 1
#define CACHE_FLUSH_IS_SAFE 1
+#define GDB_ADJUSTS_BREAK_OFFSET
extern int kgdb_ll_trap(int cmd, const char *str,
struct pt_regs *regs, long err, int trap, int sig);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0f521356432..6040d115ef5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -14,6 +14,8 @@
#include <asm/desc_defs.h>
struct x86_emulate_ctxt;
+enum x86_intercept;
+enum x86_intercept_stage;
struct x86_exception {
u8 vector;
@@ -24,6 +26,24 @@ struct x86_exception {
};
/*
+ * This struct is used to carry enough information from the instruction
+ * decoder to main KVM so that a decision can be made whether the
+ * instruction needs to be intercepted or not.
+ */
+struct x86_instruction_info {
+ u8 intercept; /* which intercept */
+ u8 rep_prefix; /* rep prefix? */
+ u8 modrm_mod; /* mod part of modrm */
+ u8 modrm_reg; /* index of register used */
+ u8 modrm_rm; /* rm part of modrm */
+ u64 src_val; /* value of source operand */
+ u8 src_bytes; /* size of source operand */
+ u8 dst_bytes; /* size of destination operand */
+ u8 ad_bytes; /* size of src/dst address */
+ u64 next_rip; /* rip following the instruction */
+};
+
+/*
* x86_emulate_ops:
*
* These operations represent the instruction emulator's interface to memory.
@@ -62,6 +82,7 @@ struct x86_exception {
#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
struct x86_emulate_ops {
/*
@@ -71,8 +92,9 @@ struct x86_emulate_ops {
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
*/
- int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu,
+ int (*read_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val,
+ unsigned int bytes,
struct x86_exception *fault);
/*
@@ -82,8 +104,8 @@ struct x86_emulate_ops {
* @val: [OUT] Value write to memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to write to memory.
*/
- int (*write_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu,
+ int (*write_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
struct x86_exception *fault);
/*
* fetch: Read bytes of standard (non-emulated/special) memory.
@@ -92,8 +114,8 @@ struct x86_emulate_ops {
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
*/
- int (*fetch)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu,
+ int (*fetch)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
struct x86_exception *fault);
/*
@@ -102,11 +124,9 @@ struct x86_emulate_ops {
* @val: [OUT] Value read from memory, zero-extended to 'u_long'.
* @bytes: [IN ] Number of bytes to read from memory.
*/
- int (*read_emulated)(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct x86_exception *fault,
- struct kvm_vcpu *vcpu);
+ int (*read_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
/*
* write_emulated: Write bytes to emulated/special memory area.
@@ -115,11 +135,10 @@ struct x86_emulate_ops {
* required).
* @bytes: [IN ] Number of bytes to write to memory.
*/
- int (*write_emulated)(unsigned long addr,
- const void *val,
+ int (*write_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, const void *val,
unsigned int bytes,
- struct x86_exception *fault,
- struct kvm_vcpu *vcpu);
+ struct x86_exception *fault);
/*
* cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
@@ -129,40 +148,54 @@ struct x86_emulate_ops {
* @new: [IN ] Value to write to @addr.
* @bytes: [IN ] Number of bytes to access using CMPXCHG.
*/
- int (*cmpxchg_emulated)(unsigned long addr,
+ int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- struct x86_exception *fault,
- struct kvm_vcpu *vcpu);
-
- int (*pio_in_emulated)(int size, unsigned short port, void *val,
- unsigned int count, struct kvm_vcpu *vcpu);
-
- int (*pio_out_emulated)(int size, unsigned short port, const void *val,
- unsigned int count, struct kvm_vcpu *vcpu);
-
- bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3,
- int seg, struct kvm_vcpu *vcpu);
- void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3,
- int seg, struct kvm_vcpu *vcpu);
- u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
- void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
- unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
- void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
- void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
- ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
- int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
- int (*cpl)(struct kvm_vcpu *vcpu);
- int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
- int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
- int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
- int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+ struct x86_exception *fault);
+ void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr);
+
+ int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count);
+
+ int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, const void *val,
+ unsigned int count);
+
+ bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3, int seg);
+ void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3, int seg);
+ unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt,
+ int seg);
+ void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
+ int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+ int (*cpl)(struct x86_emulate_ctxt *ctxt);
+ int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+ int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
+ int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ void (*halt)(struct x86_emulate_ctxt *ctxt);
+ void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
+ int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
+ void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */
+ void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */
+ int (*intercept)(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
};
+typedef u32 __attribute__((vector_size(16))) sse128_t;
+
/* Type, address-of, and value of an instruction's operand. */
struct operand {
- enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
+ enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type;
unsigned int bytes;
union {
unsigned long orig_val;
@@ -174,11 +207,13 @@ struct operand {
ulong ea;
unsigned seg;
} mem;
+ unsigned xmm;
} addr;
union {
unsigned long val;
u64 val64;
char valptr[sizeof(unsigned long) + 2];
+ sse128_t vec_val;
};
};
@@ -194,9 +229,29 @@ struct read_cache {
unsigned long end;
};
-struct decode_cache {
+struct x86_emulate_ctxt {
+ struct x86_emulate_ops *ops;
+
+ /* Register state before/after emulation. */
+ unsigned long eflags;
+ unsigned long eip; /* eip before instruction emulation */
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ int mode;
+
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
+ bool guest_mode; /* guest running a nested guest */
+ bool perm_ok; /* do not check permissions if true */
+ bool only_vendor_specific_insn;
+
+ bool have_exception;
+ struct x86_exception exception;
+
+ /* decode cache */
u8 twobyte;
u8 b;
+ u8 intercept;
u8 lock_prefix;
u8 rep_prefix;
u8 op_bytes;
@@ -209,8 +264,7 @@ struct decode_cache {
u8 seg_override;
unsigned int d;
int (*execute)(struct x86_emulate_ctxt *ctxt);
- unsigned long regs[NR_VCPU_REGS];
- unsigned long eip;
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
/* modrm */
u8 modrm;
u8 modrm_mod;
@@ -218,39 +272,17 @@ struct decode_cache {
u8 modrm_rm;
u8 modrm_seg;
bool rip_relative;
+ unsigned long _eip;
+ /* Fields above regs are cleared together. */
+ unsigned long regs[NR_VCPU_REGS];
struct fetch_cache fetch;
struct read_cache io_read;
struct read_cache mem_read;
};
-struct x86_emulate_ctxt {
- struct x86_emulate_ops *ops;
-
- /* Register state before/after emulation. */
- struct kvm_vcpu *vcpu;
-
- unsigned long eflags;
- unsigned long eip; /* eip before instruction emulation */
- /* Emulated execution mode, represented by an X86EMUL_MODE value. */
- int mode;
- u32 cs_base;
-
- /* interruptibility state, as a result of execution of STI or MOV SS */
- int interruptibility;
-
- bool perm_ok; /* do not check permissions if true */
- bool only_vendor_specific_insn;
-
- bool have_exception;
- struct x86_exception exception;
-
- /* decode cache */
- struct decode_cache decode;
-};
-
/* Repeat String Operation Prefix */
-#define REPE_PREFIX 1
-#define REPNE_PREFIX 2
+#define REPE_PREFIX 0xf3
+#define REPNE_PREFIX 0xf2
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -259,6 +291,69 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
+/* any protected mode */
+#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
+ X86EMUL_MODE_PROT64)
+
+enum x86_intercept_stage {
+ X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */
+ X86_ICPT_PRE_EXCEPT,
+ X86_ICPT_POST_EXCEPT,
+ X86_ICPT_POST_MEMACCESS,
+};
+
+enum x86_intercept {
+ x86_intercept_none,
+ x86_intercept_cr_read,
+ x86_intercept_cr_write,
+ x86_intercept_clts,
+ x86_intercept_lmsw,
+ x86_intercept_smsw,
+ x86_intercept_dr_read,
+ x86_intercept_dr_write,
+ x86_intercept_lidt,
+ x86_intercept_sidt,
+ x86_intercept_lgdt,
+ x86_intercept_sgdt,
+ x86_intercept_lldt,
+ x86_intercept_sldt,
+ x86_intercept_ltr,
+ x86_intercept_str,
+ x86_intercept_rdtsc,
+ x86_intercept_rdpmc,
+ x86_intercept_pushf,
+ x86_intercept_popf,
+ x86_intercept_cpuid,
+ x86_intercept_rsm,
+ x86_intercept_iret,
+ x86_intercept_intn,
+ x86_intercept_invd,
+ x86_intercept_pause,
+ x86_intercept_hlt,
+ x86_intercept_invlpg,
+ x86_intercept_invlpga,
+ x86_intercept_vmrun,
+ x86_intercept_vmload,
+ x86_intercept_vmsave,
+ x86_intercept_vmmcall,
+ x86_intercept_stgi,
+ x86_intercept_clgi,
+ x86_intercept_skinit,
+ x86_intercept_rdtscp,
+ x86_intercept_icebp,
+ x86_intercept_wbinvd,
+ x86_intercept_monitor,
+ x86_intercept_mwait,
+ x86_intercept_rdmsr,
+ x86_intercept_wrmsr,
+ x86_intercept_in,
+ x86_intercept_ins,
+ x86_intercept_out,
+ x86_intercept_outs,
+
+ nr_x86_intercepts
+};
+
/* Host execution mode. */
#if defined(CONFIG_X86_32)
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
@@ -270,10 +365,10 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
#define EMULATION_RESTART 1
+#define EMULATION_INTERCEPTED 2
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, int reason,
bool has_error_code, u32 error_code);
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int irq);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c8af0991fdf..dd51c83aa5d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -30,14 +30,30 @@
#define KVM_MEMORY_SLOTS 32
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_MMIO_SIZE 16
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define CR0_RESERVED_BITS \
+ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+
#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
0xFFFFFF0000000000ULL)
+#define CR4_RESERVED_BITS \
+ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
+ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
+
+#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+
#define INVALID_PAGE (~(hpa_t)0)
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
@@ -118,6 +134,9 @@ enum kvm_reg {
enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
VCPU_EXREG_CR3,
+ VCPU_EXREG_RFLAGS,
+ VCPU_EXREG_CPL,
+ VCPU_EXREG_SEGMENTS,
};
enum {
@@ -186,6 +205,7 @@ union kvm_mmu_page_role {
unsigned invalid:1;
unsigned nxe:1;
unsigned cr0_wp:1;
+ unsigned smep_andnot_wp:1;
};
};
@@ -208,15 +228,17 @@ struct kvm_mmu_page {
* in this shadow page.
*/
DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- bool multimapped; /* More than one parent_pte? */
bool unsync;
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
- union {
- u64 *parent_pte; /* !multimapped */
- struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
- };
+ unsigned long parent_ptes; /* Reverse mapping for parent_pte */
DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+ int clear_spte_count;
+#endif
+
+ struct rcu_head rcu;
};
struct kvm_pv_mmu_op_buffer {
@@ -250,13 +272,11 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
- void (*prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte, unsigned long mmu_seq);
+ u64 *spte, const void *pte);
hpa_t root_hpa;
int root_level;
int shadow_root_level;
@@ -327,8 +347,7 @@ struct kvm_vcpu_arch {
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
- struct kvm_mmu_memory_cache mmu_pte_chain_cache;
- struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+ struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
struct kvm_mmu_memory_cache mmu_page_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
@@ -340,7 +359,6 @@ struct kvm_vcpu_arch {
struct fpu guest_fpu;
u64 xcr0;
- gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
void *pio_data;
@@ -367,18 +385,31 @@ struct kvm_vcpu_arch {
/* emulate context */
struct x86_emulate_ctxt emulate_ctxt;
+ bool emulate_regs_need_sync_to_vcpu;
+ bool emulate_regs_need_sync_from_vcpu;
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
- u64 last_host_tsc;
+
+ struct {
+ u64 msr_val;
+ u64 last_steal;
+ u64 accum_steal;
+ struct gfn_to_hva_cache stime;
+ struct kvm_steal_time steal;
+ } st;
+
u64 last_guest_tsc;
u64 last_kernel_ns;
u64 last_tsc_nsec;
u64 last_tsc_write;
+ u32 virtual_tsc_khz;
bool tsc_catchup;
+ u32 tsc_catchup_mult;
+ s8 tsc_catchup_shift;
bool nmi_pending;
bool nmi_injected;
@@ -397,6 +428,11 @@ struct kvm_vcpu_arch {
u64 mcg_ctl;
u64 *mce_banks;
+ /* Cache MMIO info */
+ u64 mmio_gva;
+ unsigned access;
+ gfn_t mmio_gfn;
+
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
@@ -419,6 +455,7 @@ struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
+ unsigned int indirect_shadow_pages;
atomic_t invlpg_counter;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
@@ -448,9 +485,6 @@ struct kvm_arch {
u64 last_tsc_nsec;
u64 last_tsc_offset;
u64 last_tsc_write;
- u32 virtual_tsc_khz;
- u32 virtual_tsc_mult;
- s8 virtual_tsc_shift;
struct kvm_xen_hvm_config xen_hvm_config;
@@ -458,6 +492,8 @@ struct kvm_arch {
u64 hv_guest_os_id;
u64 hv_hypercall;
+ atomic_t reader_counter;
+
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
@@ -502,6 +538,8 @@ struct kvm_vcpu_stat {
u32 nmi_injections;
};
+struct x86_instruction_info;
+
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -538,7 +576,7 @@ struct kvm_x86_ops {
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
- void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@ -586,9 +624,17 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
+ void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
+
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
+
+ int (*check_intercept)(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
+
const struct trace_print_flags *exit_reasons_str;
};
@@ -607,7 +653,6 @@ void kvm_mmu_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
@@ -627,6 +672,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
extern bool tdp_enabled;
+/* control of guest tsc rate supported? */
+extern bool kvm_has_tsc_control;
+/* minimum supported tsc_khz for guests */
+extern u32 kvm_min_guest_tsc_khz;
+/* maximum supported tsc_khz for guests */
+extern u32 kvm_max_guest_tsc_khz;
+
enum emulation_result {
EMULATE_DONE, /* no further processing */
EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
@@ -645,9 +697,6 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
}
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-
void kvm_enable_efer_bits(u64);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
@@ -657,8 +706,6 @@ struct x86_emulate_ctxt;
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
-int emulate_clts(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -721,8 +768,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
-int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
@@ -801,11 +846,12 @@ enum {
asmlinkage void kvm_spurious_fault(void);
extern bool kvm_rebooting;
-#define __kvm_handle_fault_on_reboot(insn) \
+#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
"666: " insn "\n\t" \
"668: \n\t" \
".pushsection .fixup, \"ax\" \n" \
"667: \n\t" \
+ cleanup_insn "\n\t" \
"cmpb $0, kvm_rebooting \n\t" \
"jne 668b \n\t" \
__ASM_SIZE(push) " $666b \n\t" \
@@ -815,6 +861,9 @@ extern bool kvm_rebooting;
_ASM_PTR " 666b, 667b \n\t" \
".popsection"
+#define __kvm_handle_fault_on_reboot(insn) \
+ ____kvm_handle_fault_on_reboot(insn, "")
+
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index a427bf77a93..734c3767cfa 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,7 @@
*/
#define KVM_FEATURE_CLOCKSOURCE2 3
#define KVM_FEATURE_ASYNC_PF 4
+#define KVM_FEATURE_STEAL_TIME 5
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -30,10 +31,23 @@
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
+#define KVM_MSR_ENABLED 1
/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+#define MSR_KVM_STEAL_TIME 0x4b564d03
+
+struct kvm_steal_time {
+ __u64 steal;
+ __u32 version;
+ __u32 flags;
+ __u32 pad[12];
+};
+
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
#define KVM_MAX_MMU_OP_BATCH 32
@@ -178,6 +192,7 @@ void __init kvm_guest_init(void);
void kvm_async_pf_task_wait(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
+extern void kvm_disable_steal_time(void);
#else
#define kvm_guest_init() do { } while (0)
#define kvm_async_pf_task_wait(T) do {} while(0)
@@ -186,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void)
{
return 0;
}
+
+static inline void kvm_disable_steal_time(void)
+{
+ return;
+}
#endif
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index b60f2924c41..879fd7d3387 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -61,6 +61,7 @@ hcall(unsigned long call,
: "memory");
return call;
}
+/*:*/
/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 12d55e773eb..48142971b25 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -8,11 +8,6 @@
#ifdef CONFIG_X86_32
#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
-/*
- * For 32-bit UML - mark functions implemented in assembly that use
- * regparm input parameters:
- */
-#define asmregparm __attribute__((regparm(3)))
/*
* Make sure the compiler doesn't do anything stupid with the
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 2e9972468a5..9cdae5d47e8 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -4,7 +4,7 @@
#include <linux/percpu.h>
#include <asm/system.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/asm.h>
typedef struct {
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index eb16e94ae04..c9321f34e55 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -8,6 +8,7 @@
* Machine Check support for x86
*/
+/* MCG_CAP register defines */
#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
@@ -17,10 +18,12 @@
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
+/* MCG_STATUS register defines */
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
+/* MCi_STATUS register defines */
#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
@@ -31,12 +34,14 @@
#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
-/* MISC register defines */
-#define MCM_ADDR_SEGOFF 0 /* segment offset */
-#define MCM_ADDR_LINEAR 1 /* linear address */
-#define MCM_ADDR_PHYS 2 /* physical address */
-#define MCM_ADDR_MEM 3 /* memory address */
-#define MCM_ADDR_GENERIC 7 /* generic */
+/* MCi_MISC register defines */
+#define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f)
+#define MCI_MISC_ADDR_MODE(m) (((m) >> 6) & 7)
+#define MCI_MISC_ADDR_SEGOFF 0 /* segment offset */
+#define MCI_MISC_ADDR_LINEAR 1 /* linear address */
+#define MCI_MISC_ADDR_PHYS 2 /* physical address */
+#define MCI_MISC_ADDR_MEM 3 /* memory address */
+#define MCI_MISC_ADDR_GENERIC 7 /* generic */
/* CTL2 register defines */
#define MCI_CTL2_CMCI_EN (1ULL << 30)
@@ -119,7 +124,7 @@ extern struct atomic_notifier_head x86_mce_decoder_chain;
#include <linux/percpu.h>
#include <linux/init.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
extern int mce_disabled;
extern int mce_p5_enabled;
@@ -142,11 +147,9 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
static inline void enable_p5_mce(void) {}
#endif
-extern void (*x86_mce_decode_callback)(struct mce *m);
-
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, mce_dev);
+DECLARE_PER_CPU(struct sys_device, mce_sysdev);
/*
* Maximum banks number.
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
index 19ae14ba697..0cd3800f33b 100644
--- a/arch/x86/include/asm/memblock.h
+++ b/arch/x86/include/asm/memblock.h
@@ -4,7 +4,6 @@
#define ARCH_DISCARD_MEMBLOCK
u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
-void memblock_x86_to_bootmem(u64 start, u64 end);
void memblock_x86_reserve_range(u64 start, u64 end, char *name);
void memblock_x86_free_range(u64 start, u64 end);
@@ -19,5 +18,6 @@ u64 memblock_x86_hole_size(u64 start, u64 end);
u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
+bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align);
#endif
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index aeff3e89b22..5f55e696276 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -11,14 +11,14 @@
typedef struct {
void *ldt;
int size;
- struct mutex lock;
- void *vdso;
#ifdef CONFIG_X86_64
/* True if mm supports a task running in 32 bit compatibility mode. */
unsigned short ia32_compat;
#endif
+ struct mutex lock;
+ void *vdso;
} mm_context_t;
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8b5393ec108..69021528b43 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -2,7 +2,7 @@
#define _ASM_X86_MMU_CONTEXT_H
#include <asm/desc.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 91df7c51806..55728e12147 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -13,31 +13,11 @@ extern struct pglist_data *node_data[];
#define NODE_DATA(nid) (node_data[nid])
#include <asm/numaq.h>
-/* summit or generic arch */
-#include <asm/srat.h>
-
-extern int get_memcfg_numa_flat(void);
-/*
- * This allows any one NUMA architecture to be compiled
- * for, and still fall back to the flat function if it
- * fails.
- */
-static inline void get_memcfg_numa(void)
-{
-
- if (get_memcfg_numaq())
- return;
- if (get_memcfg_from_srat())
- return;
- get_memcfg_numa_flat();
-}
extern void resume_map_numa_kva(pgd_t *pgd);
#else /* !CONFIG_NUMA */
-#define get_memcfg_numa get_memcfg_numa_flat
-
static inline void resume_map_numa_kva(pgd_t *pgd) {}
#endif /* CONFIG_NUMA */
@@ -54,31 +34,20 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
* 64Gb / 4096bytes/page = 16777216 pages
*/
#define MAX_NR_PAGES 16777216
-#define MAX_ELEMENTS 1024
-#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
+#define MAX_SECTIONS 1024
+#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS)
extern s8 physnode_map[];
static inline int pfn_to_nid(unsigned long pfn)
{
#ifdef CONFIG_NUMA
- return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
+ return((int) physnode_map[(pfn) / PAGES_PER_SECTION]);
#else
return 0;
#endif
}
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid) \
-({ \
- pg_data_t *__pgdat = NODE_DATA(nid); \
- __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
-})
-
static inline int pfn_valid(int pfn)
{
int nid = pfn_to_nid(pfn);
@@ -88,6 +57,8 @@ static inline int pfn_valid(int pfn)
return 0;
}
+#define early_pfn_valid(pfn) pfn_valid((pfn))
+
#endif /* CONFIG_DISCONTIGMEM */
#ifdef CONFIG_NEED_MULTIPLE_NODES
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index 288b96f815a..129d9aa3ceb 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -4,40 +4,14 @@
#ifndef _ASM_X86_MMZONE_64_H
#define _ASM_X86_MMZONE_64_H
-
#ifdef CONFIG_NUMA
#include <linux/mmdebug.h>
-
#include <asm/smp.h>
-/* Simple perfect hash to map physical addresses to node numbers */
-struct memnode {
- int shift;
- unsigned int mapsize;
- s16 *map;
- s16 embedded_map[64 - 8];
-} ____cacheline_aligned; /* total size = 128 bytes */
-extern struct memnode memnode;
-#define memnode_shift memnode.shift
-#define memnodemap memnode.map
-#define memnodemapsize memnode.mapsize
-
extern struct pglist_data *node_data[];
-static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
-{
- unsigned nid;
- VIRTUAL_BUG_ON(!memnodemap);
- nid = memnodemap[addr >> memnode_shift];
- VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
- return nid;
-}
-
#define NODE_DATA(nid) (node_data[nid])
-#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
- NODE_DATA(nid)->node_spanned_pages)
#endif
#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 67763c5d8b4..9eae7752ae9 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -35,7 +35,7 @@
#define MODULE_PROC_FAMILY "K7 "
#elif defined CONFIG_MK8
#define MODULE_PROC_FAMILY "K8 "
-#elif defined CONFIG_X86_ELAN
+#elif defined CONFIG_MELAN
#define MODULE_PROC_FAMILY "ELAN "
#elif defined CONFIG_MCRUSOE
#define MODULE_PROC_FAMILY "CRUSOE "
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3cce71413d0..d52609aeeab 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -118,6 +118,7 @@
complete list. */
#define MSR_AMD64_PATCH_LEVEL 0x0000008b
+#define MSR_AMD64_TSC_RATIO 0xc0000104
#define MSR_AMD64_NB_CFG 0xc001001f
#define MSR_AMD64_PATCH_LOADER 0xc0010020
#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
@@ -258,6 +259,9 @@
#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
+#define ENERGY_PERF_BIAS_PERFORMANCE 0
+#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_POWERSAVE 15
#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
@@ -437,6 +441,18 @@
#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
+
+/* VMX_BASIC bits and bitmasks */
+#define VMX_BASIC_VMCS_SIZE_SHIFT 32
+#define VMX_BASIC_64 0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE_SHIFT 50
+#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
+#define VMX_BASIC_MEM_TYPE_WB 6LLU
+#define VMX_BASIC_INOUT 0x0040000000000000LLU
/* AMD-V MSRs */
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index af788496020..405b4032a60 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -1,7 +1,13 @@
#ifndef _ASM_X86_NOPS_H
#define _ASM_X86_NOPS_H
-/* Define nops for use with alternative() */
+/*
+ * Define nops for use with alternative() and for tracing.
+ *
+ * *_NOP5_ATOMIC must be a single instruction.
+ */
+
+#define NOP_DS_PREFIX 0x3e
/* generic versions from gas
1: nop
@@ -13,14 +19,15 @@
6: leal 0x00000000(%esi),%esi
7: leal 0x00000000(,%esi,1),%esi
*/
-#define GENERIC_NOP1 ".byte 0x90\n"
-#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
-#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
-#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
-#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
-#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
-#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
+#define GENERIC_NOP1 0x90
+#define GENERIC_NOP2 0x89,0xf6
+#define GENERIC_NOP3 0x8d,0x76,0x00
+#define GENERIC_NOP4 0x8d,0x74,0x26,0x00
+#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4
+#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
+#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
+#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7
+#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4
/* Opteron 64bit nops
1: nop
@@ -29,13 +36,14 @@
4: osp osp osp nop
*/
#define K8_NOP1 GENERIC_NOP1
-#define K8_NOP2 ".byte 0x66,0x90\n"
-#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
-#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
-#define K8_NOP5 K8_NOP3 K8_NOP2
-#define K8_NOP6 K8_NOP3 K8_NOP3
-#define K8_NOP7 K8_NOP4 K8_NOP3
-#define K8_NOP8 K8_NOP4 K8_NOP4
+#define K8_NOP2 0x66,K8_NOP1
+#define K8_NOP3 0x66,K8_NOP2
+#define K8_NOP4 0x66,K8_NOP3
+#define K8_NOP5 K8_NOP3,K8_NOP2
+#define K8_NOP6 K8_NOP3,K8_NOP3
+#define K8_NOP7 K8_NOP4,K8_NOP3
+#define K8_NOP8 K8_NOP4,K8_NOP4
+#define K8_NOP5_ATOMIC 0x66,K8_NOP4
/* K7 nops
uses eax dependencies (arbitrary choice)
@@ -47,13 +55,14 @@
7: leal 0x00000000(,%eax,1),%eax
*/
#define K7_NOP1 GENERIC_NOP1
-#define K7_NOP2 ".byte 0x8b,0xc0\n"
-#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
-#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
-#define K7_NOP5 K7_NOP4 ASM_NOP1
-#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
-#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
-#define K7_NOP8 K7_NOP7 ASM_NOP1
+#define K7_NOP2 0x8b,0xc0
+#define K7_NOP3 0x8d,0x04,0x20
+#define K7_NOP4 0x8d,0x44,0x20,0x00
+#define K7_NOP5 K7_NOP4,K7_NOP1
+#define K7_NOP6 0x8d,0x80,0,0,0,0
+#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0
+#define K7_NOP8 K7_NOP7,K7_NOP1
+#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4
/* P6 nops
uses eax dependencies (Intel-recommended choice)
@@ -69,52 +78,65 @@
There is kernel code that depends on this.
*/
#define P6_NOP1 GENERIC_NOP1
-#define P6_NOP2 ".byte 0x66,0x90\n"
-#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
-#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
-#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
-#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
-#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
+#define P6_NOP2 0x66,0x90
+#define P6_NOP3 0x0f,0x1f,0x00
+#define P6_NOP4 0x0f,0x1f,0x40,0
+#define P6_NOP5 0x0f,0x1f,0x44,0x00,0
+#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0
+#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0
+#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
+#define P6_NOP5_ATOMIC P6_NOP5
+
+#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
#if defined(CONFIG_MK7)
-#define ASM_NOP1 K7_NOP1
-#define ASM_NOP2 K7_NOP2
-#define ASM_NOP3 K7_NOP3
-#define ASM_NOP4 K7_NOP4
-#define ASM_NOP5 K7_NOP5
-#define ASM_NOP6 K7_NOP6
-#define ASM_NOP7 K7_NOP7
-#define ASM_NOP8 K7_NOP8
+#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC)
#elif defined(CONFIG_X86_P6_NOP)
-#define ASM_NOP1 P6_NOP1
-#define ASM_NOP2 P6_NOP2
-#define ASM_NOP3 P6_NOP3
-#define ASM_NOP4 P6_NOP4
-#define ASM_NOP5 P6_NOP5
-#define ASM_NOP6 P6_NOP6
-#define ASM_NOP7 P6_NOP7
-#define ASM_NOP8 P6_NOP8
+#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC)
#elif defined(CONFIG_X86_64)
-#define ASM_NOP1 K8_NOP1
-#define ASM_NOP2 K8_NOP2
-#define ASM_NOP3 K8_NOP3
-#define ASM_NOP4 K8_NOP4
-#define ASM_NOP5 K8_NOP5
-#define ASM_NOP6 K8_NOP6
-#define ASM_NOP7 K8_NOP7
-#define ASM_NOP8 K8_NOP8
+#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC)
#else
-#define ASM_NOP1 GENERIC_NOP1
-#define ASM_NOP2 GENERIC_NOP2
-#define ASM_NOP3 GENERIC_NOP3
-#define ASM_NOP4 GENERIC_NOP4
-#define ASM_NOP5 GENERIC_NOP5
-#define ASM_NOP6 GENERIC_NOP6
-#define ASM_NOP7 GENERIC_NOP7
-#define ASM_NOP8 GENERIC_NOP8
+#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC)
#endif
#define ASM_NOP_MAX 8
+#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */
+
+#ifndef __ASSEMBLY__
+extern const unsigned char * const *ideal_nops;
+extern void arch_init_ideal_nops(void);
+#endif
#endif /* _ASM_X86_NOPS_H */
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 3d4dab43c99..bfacd2ccf65 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,12 +1,24 @@
#ifndef _ASM_X86_NUMA_H
#define _ASM_X86_NUMA_H
+#include <linux/nodemask.h>
+
#include <asm/topology.h>
#include <asm/apicdef.h>
#ifdef CONFIG_NUMA
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
+
+/*
+ * Too small node sizes may confuse the VM badly. Usually they
+ * result from BIOS bugs. So dont recognize nodes as standalone
+ * NUMA entities that have less than this amount of RAM listed:
+ */
+#define NODE_MIN_SIZE (4*1024*1024)
+
+extern int numa_off;
/*
* __apicid_to_node[] stores the raw mapping between physical apicid and
@@ -17,15 +29,27 @@
* numa_cpu_node().
*/
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
+extern nodemask_t numa_nodes_parsed __initdata;
+
+extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern void __init numa_set_distance(int from, int to, int distance);
static inline void set_apicid_to_node(int apicid, s16 node)
{
__apicid_to_node[apicid] = node;
}
+
+extern int __cpuinit numa_cpu_node(int cpu);
+
#else /* CONFIG_NUMA */
static inline void set_apicid_to_node(int apicid, s16 node)
{
}
+
+static inline int numa_cpu_node(int cpu)
+{
+ return NUMA_NO_NODE;
+}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_X86_32
@@ -37,21 +61,25 @@ static inline void set_apicid_to_node(int apicid, s16 node)
#ifdef CONFIG_NUMA
extern void __cpuinit numa_set_node(int cpu, int node);
extern void __cpuinit numa_clear_node(int cpu);
-extern void __init numa_init_array(void);
extern void __init init_cpu_to_node(void);
extern void __cpuinit numa_add_cpu(int cpu);
extern void __cpuinit numa_remove_cpu(int cpu);
#else /* CONFIG_NUMA */
static inline void numa_set_node(int cpu, int node) { }
static inline void numa_clear_node(int cpu) { }
-static inline void numa_init_array(void) { }
static inline void init_cpu_to_node(void) { }
static inline void numa_add_cpu(int cpu) { }
static inline void numa_remove_cpu(int cpu) { }
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable);
+void debug_cpumask_set_cpu(int cpu, int node, bool enable);
#endif
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
+#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
+void numa_emu_cmdline(char *);
+#endif /* CONFIG_NUMA_EMU */
+
#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index c6beed1ef10..e7d6b825474 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,16 +1,6 @@
#ifndef _ASM_X86_NUMA_32_H
#define _ASM_X86_NUMA_32_H
-extern int numa_off;
-
-extern int pxm_to_nid(int pxm);
-
-#ifdef CONFIG_NUMA
-extern int __cpuinit numa_cpu_node(int cpu);
-#else /* CONFIG_NUMA */
-static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_HIGHMEM
extern void set_highmem_pages_init(void);
#else
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 344eb1790b4..0c05f7ae46e 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -1,42 +1,6 @@
#ifndef _ASM_X86_NUMA_64_H
#define _ASM_X86_NUMA_64_H
-#include <linux/nodemask.h>
-
-struct bootnode {
- u64 start;
- u64 end;
-};
-
-#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
-
-extern int numa_off;
-
extern unsigned long numa_free_all_bootmem(void);
-extern void setup_node_bootmem(int nodeid, unsigned long start,
- unsigned long end);
-
-#ifdef CONFIG_NUMA
-/*
- * Too small node sizes may confuse the VM badly. Usually they
- * result from BIOS bugs. So dont recognize nodes as standalone
- * NUMA entities that have less than this amount of RAM listed:
- */
-#define NODE_MIN_SIZE (4*1024*1024)
-
-extern nodemask_t numa_nodes_parsed __initdata;
-
-extern int __cpuinit numa_cpu_node(int cpu);
-extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
-extern void __init numa_set_distance(int from, int to, int distance);
-
-#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
-#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-void numa_emu_cmdline(char *);
-#endif /* CONFIG_NUMA_EMU */
-#else
-static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
-#endif
#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 37c516545ec..c3b3c322fd8 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -29,7 +29,7 @@
#ifdef CONFIG_X86_NUMAQ
extern int found_numaq;
-extern int get_memcfg_numaq(void);
+extern int numaq_numa_init(void);
extern int pci_numaq_init(void);
extern void *xquad_portio;
@@ -166,11 +166,6 @@ struct sys_cfg_data {
void numaq_tsc_disable(void);
-#else
-static inline int get_memcfg_numaq(void)
-{
- return 0;
-}
#endif /* CONFIG_X86_NUMAQ */
#endif /* _ASM_X86_NUMAQ_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 5ca6801b75f..87bdbca72f9 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -13,6 +13,7 @@ struct olpc_platform_t {
#define OLPC_F_PRESENT 0x01
#define OLPC_F_DCON 0x02
+#define OLPC_F_EC_WIDE_SCI 0x04
#ifdef CONFIG_OLPC
@@ -62,6 +63,13 @@ static inline int olpc_board_at_least(uint32_t rev)
return olpc_platform_info.boardrev >= rev;
}
+extern void olpc_ec_wakeup_set(u16 value);
+extern void olpc_ec_wakeup_clear(u16 value);
+extern bool olpc_ec_wakeup_available(void);
+
+extern int olpc_ec_mask_write(u16 bits);
+extern int olpc_ec_sci_query(u16 *sci_value);
+
#else
static inline int machine_is_olpc(void)
@@ -74,6 +82,20 @@ static inline int olpc_has_dcon(void)
return 0;
}
+static inline void olpc_ec_wakeup_set(u16 value) { }
+static inline void olpc_ec_wakeup_clear(u16 value) { }
+
+static inline bool olpc_ec_wakeup_available(void)
+{
+ return false;
+}
+
+#endif
+
+#ifdef CONFIG_OLPC_XO1_PM
+extern void do_olpc_suspend_lowlevel(void);
+extern void olpc_xo1_pm_wakeup_set(u16 value);
+extern void olpc_xo1_pm_wakeup_clear(u16 value);
#endif
extern int pci_olpc_init(void);
@@ -83,14 +105,19 @@ extern int pci_olpc_init(void);
extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
unsigned char *outbuf, size_t outlen);
-extern int olpc_ec_mask_set(uint8_t bits);
-extern int olpc_ec_mask_unset(uint8_t bits);
-
/* EC commands */
-#define EC_FIRMWARE_REV 0x08
-#define EC_WLAN_ENTER_RESET 0x35
-#define EC_WLAN_LEAVE_RESET 0x25
+#define EC_FIRMWARE_REV 0x08
+#define EC_WRITE_SCI_MASK 0x1b
+#define EC_WAKE_UP_WLAN 0x24
+#define EC_WLAN_LEAVE_RESET 0x25
+#define EC_READ_EB_MODE 0x2a
+#define EC_SET_SCI_INHIBIT 0x32
+#define EC_SET_SCI_INHIBIT_RELEASE 0x34
+#define EC_WLAN_ENTER_RESET 0x35
+#define EC_WRITE_EXT_SCI_MASK 0x38
+#define EC_SCI_QUERY 0x84
+#define EC_EXT_SCI_QUERY 0x85
/* SCI source values */
@@ -99,10 +126,12 @@ extern int olpc_ec_mask_unset(uint8_t bits);
#define EC_SCI_SRC_BATTERY 0x02
#define EC_SCI_SRC_BATSOC 0x04
#define EC_SCI_SRC_BATERR 0x08
-#define EC_SCI_SRC_EBOOK 0x10
-#define EC_SCI_SRC_WLAN 0x20
+#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */
+#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */
#define EC_SCI_SRC_ACPWR 0x40
-#define EC_SCI_SRC_ALL 0x7F
+#define EC_SCI_SRC_BATCRIT 0x80
+#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */
+#define EC_SCI_SRC_ALL 0x1FF
/* GPIO assignments */
@@ -116,7 +145,7 @@ extern int olpc_ec_mask_unset(uint8_t bits);
#define OLPC_GPIO_SMB_CLK 14
#define OLPC_GPIO_SMB_DATA 15
#define OLPC_GPIO_WORKAUX geode_gpio(24)
-#define OLPC_GPIO_LID geode_gpio(26)
-#define OLPC_GPIO_ECSCI geode_gpio(27)
+#define OLPC_GPIO_LID 26
+#define OLPC_GPIO_ECSCI 27
#endif /* _ASM_X86_OLPC_H */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index c5d3a5abbb9..24487712e0b 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -26,15 +26,12 @@ extern void setup_olpc_ofw_pgd(void);
/* check if OFW was detected during boot */
extern bool olpc_ofw_present(void);
+extern void olpc_dt_build_devicetree(void);
+
#else /* !CONFIG_OLPC */
static inline void olpc_ofw_detect(void) { }
static inline void setup_olpc_ofw_pgd(void) { }
-#endif /* !CONFIG_OLPC */
-
-#ifdef CONFIG_OF_PROMTREE
-extern void olpc_dt_build_devicetree(void);
-#else
static inline void olpc_dt_build_devicetree(void) { }
-#endif
+#endif /* !CONFIG_OLPC */
#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ebbc4d8ab17..a7d2db9a74f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void)
return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
}
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+ return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
+}
+
static inline unsigned long long paravirt_read_pmc(int counter)
{
return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 82885099c86..2c765216311 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -89,6 +89,7 @@ struct pv_lazy_ops {
struct pv_time_ops {
unsigned long long (*sched_clock)(void);
+ unsigned long long (*steal_clock)(int cpu);
unsigned long (*get_tsc_khz)(void);
};
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 67612922963..d498943b906 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -135,8 +135,6 @@ void default_teardown_msi_irqs(struct pci_dev *dev);
#include "pci_64.h"
#endif
-void dma32_reserve_bootmem(void);
-
/* implement the pci_ DMA API in terms of the generic device dma_ one */
#include <asm-generic/pci-dma-compat.h>
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d475b4398d8..3470c9d0ebb 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -388,12 +388,9 @@ do { \
#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
-/*
- * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
- * faster than an xchg with forced lock semantics.
- */
-#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
-#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val)
+#define __this_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val)
+#define __this_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val)
#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -485,6 +482,8 @@ do { \
#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
@@ -509,6 +508,11 @@ do { \
* it in software. The address used in the cmpxchg16 instruction must be
* aligned to a 16 byte boundary.
*/
+#ifdef CONFIG_SMP
+#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
+#else
+#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
+#endif
#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
({ \
char __ret; \
@@ -517,7 +521,7 @@ do { \
typeof(o2) __o2 = o2; \
typeof(o2) __n2 = n2; \
typeof(o2) __dummy; \
- alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \
+ alternative_io(CMPXCHG16B_EMU_CALL, \
"cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
X86_FEATURE_CX16, \
ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
@@ -542,6 +546,33 @@ do { \
old__; \
})
+static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
+ const unsigned long __percpu *addr)
+{
+ unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+
+ return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
+}
+
+static inline int x86_this_cpu_variable_test_bit(int nr,
+ const unsigned long __percpu *addr)
+{
+ int oldbit;
+
+ asm volatile("bt "__percpu_arg(2)",%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit)
+ : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+ return oldbit;
+}
+
+#define x86_this_cpu_test_bit(nr, addr) \
+ (__builtin_constant_p((nr)) \
+ ? x86_this_cpu_constant_test_bit((nr), (addr)) \
+ : x86_this_cpu_variable_test_bit((nr), (addr)))
+
+
#include <asm-generic/percpu.h>
/* We can use this directly for local CPU (faster). */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index d9d4dae305f..094fb30817a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -152,6 +152,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
(regs)->bp = caller_frame_pointer(); \
(regs)->cs = __KERNEL_CS; \
regs->flags = 0; \
+ asm volatile( \
+ _ASM_MOV "%%"_ASM_SP ", %0\n" \
+ : "=m" ((regs)->sp) \
+ :: "memory" \
+ ); \
}
#else
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 56fd9e3abbd..4f7e67e2345 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -102,6 +102,14 @@
#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
/*
+ * If an event has alias it should be marked
+ * with a special bit. (Don't forget to check
+ * P4_PEBS_CONFIG_MASK and related bits on
+ * modification.)
+ */
+#define P4_CONFIG_ALIASABLE (1 << 9)
+
+/*
* The bits we allow to pass for RAW events
*/
#define P4_CONFIG_MASK_ESCR \
@@ -123,6 +131,31 @@
(p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
(p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
+/*
+ * In case of event aliasing we need to preserve some
+ * caller bits, otherwise the mapping won't be complete.
+ */
+#define P4_CONFIG_EVENT_ALIAS_MASK \
+ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \
+ p4_config_pack_cccr(P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE))
+
+#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \
+ ((P4_CONFIG_HT) | \
+ p4_config_pack_escr(P4_ESCR_T0_OS | \
+ P4_ESCR_T0_USR | \
+ P4_ESCR_T1_OS | \
+ P4_ESCR_T1_USR) | \
+ p4_config_pack_cccr(P4_CCCR_OVF | \
+ P4_CCCR_CASCADE | \
+ P4_CCCR_FORCE_OVF | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_OVF_PMI_T0 | \
+ P4_CCCR_OVF_PMI_T1 | \
+ P4_CONFIG_ALIASABLE))
+
static inline bool p4_is_event_cascaded(u64 config)
{
u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7db7723d1f3..013286a10c2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -107,7 +107,8 @@
#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
+#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
@@ -129,7 +130,8 @@
#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
+#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
@@ -299,6 +301,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+extern void native_pagetable_reserve(u64 start, u64 end);
#ifdef CONFIG_X86_32
extern void native_pagetable_setup_start(pgd_t *base);
extern void native_pagetable_setup_done(pgd_t *base);
diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
new file mode 100644
index 00000000000..4950a0b1d09
--- /dev/null
+++ b/arch/x86/include/asm/probe_roms.h
@@ -0,0 +1,8 @@
+#ifndef _PROBE_ROMS_H_
+#define _PROBE_ROMS_H_
+struct pci_dev;
+
+extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
+extern void pci_unmap_biosrom(void __iomem *rom);
+extern size_t pci_biosrom_size(struct pci_dev *pdev);
+#endif
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index a898a2b6e10..2dddb317bb3 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -59,7 +59,9 @@
#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
+#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
+#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
/*
* x86-64 Task Priority Register, CR8
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4c25ab48257..219371546af 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -754,10 +754,10 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
extern void select_idle_routine(const struct cpuinfo_x86 *c);
-extern void init_c1e_mask(void);
+extern void init_amd_e400_c1e_mask(void);
extern unsigned long boot_option_idle_override;
-extern bool c1e_detected;
+extern bool amd_e400_c1e_detected;
enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
IDLE_POLL, IDLE_FORCE_MWAIT};
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 971e0b46446..644dd885f05 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -19,7 +19,7 @@
#include <linux/pci.h>
#include <asm/irq.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/setup.h>
#include <asm/irq_controller.h>
@@ -30,17 +30,6 @@ extern void add_dtb(u64 data);
extern void x86_add_irq_domains(void);
void __cpuinit x86_of_pci_init(void);
void x86_dtb_init(void);
-
-static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
-{
- return pdev ? pdev->dev.of_node : NULL;
-}
-
-static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
-{
- return pci_device_to_OF_node(bus->self);
-}
-
#else
static inline void add_dtb(u64 data) { }
static inline void x86_add_irq_domains(void) { }
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 1babf8adecd..94e7618fcac 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -136,6 +136,7 @@ struct cpuinfo_x86;
struct task_struct;
extern unsigned long profile_pc(struct pt_regs *regs);
+#define profile_pc profile_pc
extern unsigned long
convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
@@ -202,20 +203,11 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
#endif
}
-static inline unsigned long instruction_pointer(struct pt_regs *regs)
-{
- return regs->ip;
-}
-
-static inline unsigned long frame_pointer(struct pt_regs *regs)
-{
- return regs->bp;
-}
+#define GET_IP(regs) ((regs)->ip)
+#define GET_FP(regs) ((regs)->bp)
+#define GET_USP(regs) ((regs)->sp)
-static inline unsigned long user_stack_pointer(struct pt_regs *regs)
-{
- return regs->sp;
-}
+#include <asm-generic/ptrace.h>
/* Query offset/name of register from its name/offset */
extern int regs_query_register_offset(const char *name);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 31d84acc151..a518c0a4504 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -22,6 +22,8 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
u64 product;
#ifdef __i386__
u32 tmp1, tmp2;
+#else
+ ulong tmp;
#endif
if (shift < 0)
@@ -42,8 +44,11 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
__asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+ "mul %[mul_frac] ; shrd $32, %[hi], %[lo]"
+ : [lo]"=a"(product),
+ [hi]"=d"(tmp)
+ : "0"(delta),
+ [mul_frac]"rm"((u64)mul_frac));
#else
#error implement me!
#endif
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
index 6a8c0d64510..a5370a03d90 100644
--- a/arch/x86/include/asm/rwlock.h
+++ b/arch/x86/include/asm/rwlock.h
@@ -1,7 +1,48 @@
#ifndef _ASM_X86_RWLOCK_H
#define _ASM_X86_RWLOCK_H
-#define RW_LOCK_BIAS 0x01000000
+#include <asm/asm.h>
+
+#if CONFIG_NR_CPUS <= 2048
+
+#ifndef __ASSEMBLY__
+typedef union {
+ s32 lock;
+ s32 write;
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS 0x00100000
+#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##l)
+#define READ_LOCK_ATOMIC(n) atomic_##n
+#define WRITE_LOCK_ADD(n) __ASM_FORM_COMMA(addl n)
+#define WRITE_LOCK_SUB(n) __ASM_FORM_COMMA(subl n)
+#define WRITE_LOCK_CMP RW_LOCK_BIAS
+
+#else /* CONFIG_NR_CPUS > 2048 */
+
+#include <linux/const.h>
+
+#ifndef __ASSEMBLY__
+typedef union {
+ s64 lock;
+ struct {
+ u32 read;
+ s32 write;
+ };
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS (_AC(1,L) << 32)
+#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##q)
+#define READ_LOCK_ATOMIC(n) atomic64_##n
+#define WRITE_LOCK_ADD(n) __ASM_FORM(incl)
+#define WRITE_LOCK_SUB(n) __ASM_FORM(decl)
+#define WRITE_LOCK_CMP 1
+
+#endif /* CONFIG_NR_CPUS */
+
+#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index cd84f7208f7..5e641715c3f 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -162,7 +162,7 @@
#define GDT_ENTRY_DEFAULT_USER32_CS 4
#define GDT_ENTRY_DEFAULT_USER_DS 5
#define GDT_ENTRY_DEFAULT_USER_CS 6
-#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
#define __USER32_DS __USER_DS
#define GDT_ENTRY_TSS 8 /* needs two entries */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index db8aa19a08a..9756551ec76 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -88,7 +88,7 @@ void *extend_brk(size_t size, size_t align);
* executable.)
*/
#define RESERVE_BRK(name,sz) \
- static void __section(.discard.text) __used \
+ static void __section(.discard.text) __used notrace \
__brk_reservation_fn_##name##__(void) { \
asm volatile ( \
".pushsection .brk_reservation,\"aw\",@nobits;" \
@@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align);
type *name; \
RESERVE_BRK(name, sizeof(type) * entries)
+extern void probe_roms(void);
#ifdef __i386__
void __init i386_start_kernel(void);
-extern void probe_roms(void);
#else
void __init x86_64_start_kernel(char *real_mode);
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 725b7783199..49adfd7bb4a 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -10,7 +10,11 @@ static inline void smpboot_clear_io_apic_irqs(void)
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0xa, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
local_flush_tlb();
pr_debug("1.\n");
*((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
@@ -23,6 +27,8 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
static inline void smpboot_restore_warm_reset_vector(void)
{
+ unsigned long flags;
+
/*
* Install writable page 0 entry to set BIOS data area.
*/
@@ -32,7 +38,9 @@ static inline void smpboot_restore_warm_reset_vector(void)
* Paranoid: Set warm reset code and vector here back
* to default values.
*/
+ spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
*((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
}
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 3089f70c0c5..ee67edf86fd 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -1,8 +1,7 @@
#ifndef _ASM_X86_SPINLOCK_H
#define _ASM_X86_SPINLOCK_H
-#include <asm/atomic.h>
-#include <asm/rwlock.h>
+#include <linux/atomic.h>
#include <asm/page.h>
#include <asm/processor.h>
#include <linux/compiler.h>
@@ -234,7 +233,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
*/
static inline int arch_read_can_lock(arch_rwlock_t *lock)
{
- return (int)(lock)->lock > 0;
+ return lock->lock > 0;
}
/**
@@ -243,12 +242,12 @@ static inline int arch_read_can_lock(arch_rwlock_t *lock)
*/
static inline int arch_write_can_lock(arch_rwlock_t *lock)
{
- return (lock)->lock == RW_LOCK_BIAS;
+ return lock->write == WRITE_LOCK_CMP;
}
static inline void arch_read_lock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
+ asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t"
"jns 1f\n"
"call __read_lock_failed\n\t"
"1:\n"
@@ -257,47 +256,55 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
static inline void arch_write_lock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
+ asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t"
"jz 1f\n"
"call __write_lock_failed\n\t"
"1:\n"
- ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
+ ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS)
+ : "memory");
}
static inline int arch_read_trylock(arch_rwlock_t *lock)
{
- atomic_t *count = (atomic_t *)lock;
+ READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock;
- if (atomic_dec_return(count) >= 0)
+ if (READ_LOCK_ATOMIC(dec_return)(count) >= 0)
return 1;
- atomic_inc(count);
+ READ_LOCK_ATOMIC(inc)(count);
return 0;
}
static inline int arch_write_trylock(arch_rwlock_t *lock)
{
- atomic_t *count = (atomic_t *)lock;
+ atomic_t *count = (atomic_t *)&lock->write;
- if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+ if (atomic_sub_and_test(WRITE_LOCK_CMP, count))
return 1;
- atomic_add(RW_LOCK_BIAS, count);
+ atomic_add(WRITE_LOCK_CMP, count);
return 0;
}
static inline void arch_read_unlock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
+ asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0"
+ :"+m" (rw->lock) : : "memory");
}
static inline void arch_write_unlock(arch_rwlock_t *rw)
{
- asm volatile(LOCK_PREFIX "addl %1, %0"
- : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
+ asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
+ : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
}
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
+#undef READ_LOCK_SIZE
+#undef READ_LOCK_ATOMIC
+#undef WRITE_LOCK_ADD
+#undef WRITE_LOCK_SUB
+#undef WRITE_LOCK_CMP
+
#define arch_spin_relax(lock) cpu_relax()
#define arch_read_relax(lock) cpu_relax()
#define arch_write_relax(lock) cpu_relax()
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index dcb48b2edc1..7c7a486fcb6 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -11,10 +11,6 @@ typedef struct arch_spinlock {
#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
-typedef struct {
- unsigned int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
+#include <asm/rwlock.h>
#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
deleted file mode 100644
index b508d639d1a..00000000000
--- a/arch/x86/include/asm/srat.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Some of the code in this file has been gleaned from the 64 bit
- * discontigmem support code base.
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to Pat Gaughen <gone@us.ibm.com>
- */
-
-#ifndef _ASM_X86_SRAT_H
-#define _ASM_X86_SRAT_H
-
-#ifdef CONFIG_ACPI_NUMA
-extern int get_memcfg_from_srat(void);
-#else
-static inline int get_memcfg_from_srat(void)
-{
- return 0;
-}
-#endif
-
-#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index d7e89c83645..70bbe39043a 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -37,9 +37,6 @@ print_context_stack_bp(struct thread_info *tinfo,
/* Generic stack tracer with callbacks */
struct stacktrace_ops {
- void (*warning)(void *data, char *msg);
- /* msg must contain %s for the symbol */
- void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
void (*address)(void *data, unsigned long address, int reliable);
/* On negative return stop dumping */
int (*stack)(void *data, char *name);
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index fd921c3a684..487055c8c1a 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -9,8 +9,6 @@
#include <asm/desc.h>
#include <asm/i387.h>
-static inline int arch_prepare_suspend(void) { return 0; }
-
/* image of the saved processor state */
struct saved_context {
u16 es, fs, gs, ss;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 8d942afae68..09b0bf10415 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -9,11 +9,6 @@
#include <asm/desc.h>
#include <asm/i387.h>
-static inline int arch_prepare_suspend(void)
-{
- return 0;
-}
-
/*
* Image of the saved processor state, used by the low level ACPI suspend to
* RAM code and by the low level hibernation code.
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 12569e691ce..c2ff2a1d845 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -303,24 +303,81 @@ static inline void native_wbinvd(void)
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
-#define read_cr0() (native_read_cr0())
-#define write_cr0(x) (native_write_cr0(x))
-#define read_cr2() (native_read_cr2())
-#define write_cr2(x) (native_write_cr2(x))
-#define read_cr3() (native_read_cr3())
-#define write_cr3(x) (native_write_cr3(x))
-#define read_cr4() (native_read_cr4())
-#define read_cr4_safe() (native_read_cr4_safe())
-#define write_cr4(x) (native_write_cr4(x))
-#define wbinvd() (native_wbinvd())
+
+static inline unsigned long read_cr0(void)
+{
+ return native_read_cr0();
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ native_write_cr0(x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+ return native_read_cr2();
+}
+
+static inline void write_cr2(unsigned long x)
+{
+ native_write_cr2(x);
+}
+
+static inline unsigned long read_cr3(void)
+{
+ return native_read_cr3();
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ native_write_cr3(x);
+}
+
+static inline unsigned long read_cr4(void)
+{
+ return native_read_cr4();
+}
+
+static inline unsigned long read_cr4_safe(void)
+{
+ return native_read_cr4_safe();
+}
+
+static inline void write_cr4(unsigned long x)
+{
+ native_write_cr4(x);
+}
+
+static inline void wbinvd(void)
+{
+ native_wbinvd();
+}
+
#ifdef CONFIG_X86_64
-#define read_cr8() (native_read_cr8())
-#define write_cr8(x) (native_write_cr8(x))
-#define load_gs_index native_load_gs_index
+
+static inline unsigned long read_cr8(void)
+{
+ return native_read_cr8();
+}
+
+static inline void write_cr8(unsigned long x)
+{
+ native_write_cr8(x);
+}
+
+static inline void load_gs_index(unsigned selector)
+{
+ native_load_gs_index(selector);
+}
+
#endif
/* Clear the 'TS' bit */
-#define clts() (native_clts())
+static inline void clts(void)
+{
+ native_clts();
+}
#endif/* CONFIG_PARAVIRT */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1f2e61e2898..a1fe5c127b5 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -21,7 +21,7 @@ struct task_struct;
struct exec_domain;
#include <asm/processor.h>
#include <asm/ftrace.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
struct thread_info {
struct task_struct *task; /* main task structure */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 7bdec4e9b73..92b8aec0697 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -1,10 +1,12 @@
#ifndef _ASM_X86_TIME_H
#define _ASM_X86_TIME_H
-extern void hpet_time_init(void);
-
+#include <linux/clocksource.h>
#include <asm/mc146818rtc.h>
+extern void hpet_time_init(void);
extern void time_init(void);
+extern struct clock_event_device *global_clock_event;
+
#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 910a7084f7f..c00692476e9 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -93,19 +93,11 @@ extern void setup_node_to_cpumask_map(void);
#define pcibus_to_node(bus) __pcibus_to_node(bus)
#ifdef CONFIG_X86_32
-extern unsigned long node_start_pfn[];
-extern unsigned long node_end_pfn[];
-extern unsigned long node_remap_size[];
-#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
-
# define SD_CACHE_NICE_TRIES 1
# define SD_IDLE_IDX 1
-
#else
-
# define SD_CACHE_NICE_TRIES 2
# define SD_IDLE_IDX 2
-
#endif
/* sched_domains SD_NODE_INIT for NUMA machines */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0310da67307..2bae0a513b4 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_TRAPS_H
#define _ASM_X86_TRAPS_H
+#include <linux/kprobes.h>
+
#include <asm/debugreg.h>
#include <asm/siginfo.h> /* TRAP_TRACE, ... */
@@ -38,6 +40,7 @@ asmlinkage void alignment_check(void);
asmlinkage void machine_check(void);
#endif /* CONFIG_X86_MCE */
asmlinkage void simd_coprocessor_error(void);
+asmlinkage void emulate_vsyscall(void);
dotraplinkage void do_divide_error(struct pt_regs *, long);
dotraplinkage void do_debug(struct pt_regs *, long);
@@ -64,6 +67,7 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long);
dotraplinkage void do_machine_check(struct pt_regs *, long);
#endif
dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
+dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *, long);
#endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index abd3e0ea762..36361bf6fdd 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -6,7 +6,6 @@
#include <linux/errno.h>
#include <linux/compiler.h>
#include <linux/thread_info.h>
-#include <linux/prefetch.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
@@ -42,7 +41,7 @@
* Returns 0 if the range is valid, nonzero otherwise.
*
* This is equivalent to the following test:
- * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
*
* This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
*/
@@ -556,6 +555,9 @@ struct __large_struct { unsigned long buf[100]; };
#endif /* CONFIG_X86_WP_WORKS_OK */
+extern unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+
/*
* movsl can be slow when source and dest are not both 8-byte aligned
*/
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 088d09fb161..566e803cc60 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -6,7 +6,6 @@
*/
#include <linux/errno.h>
#include <linux/thread_info.h>
-#include <linux/prefetch.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 316708d5af9..1c66d30971a 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -6,7 +6,6 @@
*/
#include <linux/compiler.h>
#include <linux/errno.h>
-#include <linux/prefetch.h>
#include <linux/lockdep.h>
#include <asm/alternative.h>
#include <asm/cpufeature.h>
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index a755ef5e597..593485b38ab 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -350,10 +350,12 @@
#define __NR_open_by_handle_at 342
#define __NR_clock_adjtime 343
#define __NR_syncfs 344
+#define __NR_sendmmsg 345
+#define __NR_setns 346
#ifdef __KERNEL__
-#define NR_syscalls 345
+#define NR_syscalls 347
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 160fa76bd57..705bf139288 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -677,6 +677,10 @@ __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
#define __NR_syncfs 306
__SYSCALL(__NR_syncfs, sys_syncfs)
+#define __NR_sendmmsg 307
+__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
+#define __NR_setns 308
+__SYSCALL(__NR_setns, sys_setns)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 3e094af443c..37d369859c8 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -5,7 +5,7 @@
*
* SGI UV Broadcast Assist Unit definitions
*
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_BAU_H
@@ -35,17 +35,20 @@
#define MAX_CPUS_PER_UVHUB 64
#define MAX_CPUS_PER_SOCKET 32
-#define UV_ADP_SIZE 64 /* hardware-provided max. */
-#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */
-#define UV_ITEMS_PER_DESCRIPTOR 8
+#define ADP_SZ 64 /* hardware-provided max. */
+#define UV_CPUS_PER_AS 32 /* hardware-provided max. */
+#define ITEMS_PER_DESC 8
/* the 'throttle' to prevent the hardware stay-busy bug */
#define MAX_BAU_CONCURRENT 3
#define UV_ACT_STATUS_MASK 0x3
#define UV_ACT_STATUS_SIZE 2
#define UV_DISTRIBUTION_SIZE 256
#define UV_SW_ACK_NPENDING 8
-#define UV_NET_ENDPOINT_INTD 0x38
-#define UV_DESC_BASE_PNODE_SHIFT 49
+#define UV1_NET_ENDPOINT_INTD 0x38
+#define UV2_NET_ENDPOINT_INTD 0x28
+#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \
+ UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD)
+#define UV_DESC_PSHIFT 49
#define UV_PAYLOADQ_PNODE_SHIFT 49
#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
@@ -53,29 +56,72 @@
#define UV_BAU_TUNABLES_FILE "bau_tunables"
#define WHITESPACE " \t\n"
#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
-#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL
+#define cpubit_isset(cpu, bau_local_cpumask) \
+ test_bit((cpu), (bau_local_cpumask).bits)
+
/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */
-#define BAU_MISC_CONTROL_MULT_MASK 3
+/*
+ * UV2: Bit 19 selects between
+ * (0): 10 microsecond timebase and
+ * (1): 80 microseconds
+ * we're using 655us, similar to UV1: 65 units of 10us
+ */
+#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
+#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL)
+
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \
+ UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \
+ UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
-#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
+#define BAU_MISC_CONTROL_MULT_MASK 3
+
+#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
/* [30:28] URGENCY_7 an index into a table of times */
-#define BAU_URGENCY_7_SHIFT 28
-#define BAU_URGENCY_7_MASK 7
+#define BAU_URGENCY_7_SHIFT 28
+#define BAU_URGENCY_7_MASK 7
-#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
+#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
/* [45:40] BAU - BAU transaction timeout select - a multiplier */
-#define BAU_TRANS_SHIFT 40
-#define BAU_TRANS_MASK 0x3f
+#define BAU_TRANS_SHIFT 40
+#define BAU_TRANS_MASK 0x3f
+
+/*
+ * shorten some awkward names
+ */
+#define AS_PUSH_SHIFT UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT
+#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
+#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
+#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
+#define write_gmmr uv_write_global_mmr64
+#define write_lmmr uv_write_local_mmr
+#define read_lmmr uv_read_local_mmr
+#define read_gmmr uv_read_global_mmr64
/*
* bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
*/
-#define DESC_STATUS_IDLE 0
-#define DESC_STATUS_ACTIVE 1
-#define DESC_STATUS_DESTINATION_TIMEOUT 2
-#define DESC_STATUS_SOURCE_TIMEOUT 3
+#define DS_IDLE 0
+#define DS_ACTIVE 1
+#define DS_DESTINATION_TIMEOUT 2
+#define DS_SOURCE_TIMEOUT 3
+/*
+ * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2
+ * values 1 and 3 will not occur
+ * Decoded meaning ERROR BUSY AUX ERR
+ * ------------------------------- ---- ----- -------
+ * IDLE 0 0 0
+ * BUSY (active) 0 1 0
+ * SW Ack Timeout (destination) 1 0 0
+ * SW Ack INTD rejected (strong NACK) 1 0 1
+ * Source Side Time Out Detected 1 1 0
+ * Destination Side PUT Failed 1 1 1
+ */
+#define UV2H_DESC_IDLE 0
+#define UV2H_DESC_BUSY 2
+#define UV2H_DESC_DEST_TIMEOUT 4
+#define UV2H_DESC_DEST_STRONG_NACK 5
+#define UV2H_DESC_SOURCE_TIMEOUT 6
+#define UV2H_DESC_DEST_PUT_ERR 7
/*
* delay for 'plugged' timeout retries, in microseconds
@@ -86,13 +132,24 @@
* threshholds at which to use IPI to free resources
*/
/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
-#define PLUGSB4RESET 100
+#define PLUGSB4RESET 100
/* after this many consecutive timeouts, use IPI to release resources */
-#define TIMEOUTSB4RESET 1
+#define TIMEOUTSB4RESET 1
/* at this number uses of IPI to release resources, giveup the request */
-#define IPI_RESET_LIMIT 1
+#define IPI_RESET_LIMIT 1
/* after this # consecutive successes, bump up the throttle if it was lowered */
-#define COMPLETE_THRESHOLD 5
+#define COMPLETE_THRESHOLD 5
+
+#define UV_LB_SUBNODEID 0x10
+
+/* these two are the same for UV1 and UV2: */
+#define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
+#define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK
+/* 4 bits of software ack period */
+#define UV2_ACK_MASK 0x7UL
+#define UV2_ACK_UNITS_SHFT 3
+#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT
+#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
/*
* number of entries in the destination side payload queue
@@ -113,9 +170,16 @@
/*
* tuning the action when the numalink network is extremely delayed
*/
-#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */
-#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */
-#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */
+#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in
+ microseconds */
+#define CONGESTED_REPS 10 /* long delays averaged over
+ this many broadcasts */
+#define CONGESTED_PERIOD 30 /* time for the bau to be
+ disabled, in seconds */
+/* see msg_type: */
+#define MSG_NOOP 0
+#define MSG_REGULAR 1
+#define MSG_RETRY 2
/*
* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ -124,11 +188,11 @@
* The distribution specification (32 bytes) is interpreted as a 256-bit
* distribution vector. Adjacent bits correspond to consecutive even numbered
* nodeIDs. The result of adding the index of a given bit to the 15-bit
- * 'base_dest_nodeid' field of the header corresponds to the
+ * 'base_dest_nasid' field of the header corresponds to the
* destination nodeID associated with that specified bit.
*/
-struct bau_target_uvhubmask {
- unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
+struct pnmask {
+ unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
};
/*
@@ -137,7 +201,7 @@ struct bau_target_uvhubmask {
* enough bits for max. cpu's per uvhub)
*/
struct bau_local_cpumask {
- unsigned long bits;
+ unsigned long bits;
};
/*
@@ -158,14 +222,14 @@ struct bau_local_cpumask {
* The payload is software-defined for INTD transactions
*/
struct bau_msg_payload {
- unsigned long address; /* signifies a page or all TLB's
- of the cpu */
+ unsigned long address; /* signifies a page or all
+ TLB's of the cpu */
/* 64 bits */
- unsigned short sending_cpu; /* filled in by sender */
+ unsigned short sending_cpu; /* filled in by sender */
/* 16 bits */
- unsigned short acknowledge_count;/* filled in by destination */
+ unsigned short acknowledge_count; /* filled in by destination */
/* 16 bits */
- unsigned int reserved1:32; /* not usable */
+ unsigned int reserved1:32; /* not usable */
};
@@ -174,93 +238,96 @@ struct bau_msg_payload {
* see table 4.2.3.0.1 in broacast_assist spec.
*/
struct bau_msg_header {
- unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
+ unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
/* bits 5:0 */
- unsigned int base_dest_nodeid:15; /* nasid of the */
- /* bits 20:6 */ /* first bit in uvhub map */
- unsigned int command:8; /* message type */
+ unsigned int base_dest_nasid:15; /* nasid of the first bit */
+ /* bits 20:6 */ /* in uvhub map */
+ unsigned int command:8; /* message type */
/* bits 28:21 */
- /* 0x38: SN3net EndPoint Message */
- unsigned int rsvd_1:3; /* must be zero */
+ /* 0x38: SN3net EndPoint Message */
+ unsigned int rsvd_1:3; /* must be zero */
/* bits 31:29 */
- /* int will align on 32 bits */
- unsigned int rsvd_2:9; /* must be zero */
+ /* int will align on 32 bits */
+ unsigned int rsvd_2:9; /* must be zero */
/* bits 40:32 */
- /* Suppl_A is 56-41 */
- unsigned int sequence:16;/* message sequence number */
- /* bits 56:41 */ /* becomes bytes 16-17 of msg */
- /* Address field (96:57) is never used as an
- address (these are address bits 42:3) */
-
- unsigned int rsvd_3:1; /* must be zero */
+ /* Suppl_A is 56-41 */
+ unsigned int sequence:16; /* message sequence number */
+ /* bits 56:41 */ /* becomes bytes 16-17 of msg */
+ /* Address field (96:57) is
+ never used as an address
+ (these are address bits
+ 42:3) */
+
+ unsigned int rsvd_3:1; /* must be zero */
/* bit 57 */
- /* address bits 27:4 are payload */
+ /* address bits 27:4 are payload */
/* these next 24 (58-81) bits become bytes 12-14 of msg */
-
/* bits 65:58 land in byte 12 */
- unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
+ unsigned int replied_to:1; /* sent as 0 by the source to
+ byte 12 */
/* bit 58 */
- unsigned int msg_type:3; /* software type of the message*/
+ unsigned int msg_type:3; /* software type of the
+ message */
/* bits 61:59 */
- unsigned int canceled:1; /* message canceled, resource to be freed*/
+ unsigned int canceled:1; /* message canceled, resource
+ is to be freed*/
/* bit 62 */
- unsigned int payload_1a:1;/* not currently used */
+ unsigned int payload_1a:1; /* not currently used */
/* bit 63 */
- unsigned int payload_1b:2;/* not currently used */
+ unsigned int payload_1b:2; /* not currently used */
/* bits 65:64 */
/* bits 73:66 land in byte 13 */
- unsigned int payload_1ca:6;/* not currently used */
+ unsigned int payload_1ca:6; /* not currently used */
/* bits 71:66 */
- unsigned int payload_1c:2;/* not currently used */
+ unsigned int payload_1c:2; /* not currently used */
/* bits 73:72 */
/* bits 81:74 land in byte 14 */
- unsigned int payload_1d:6;/* not currently used */
+ unsigned int payload_1d:6; /* not currently used */
/* bits 79:74 */
- unsigned int payload_1e:2;/* not currently used */
+ unsigned int payload_1e:2; /* not currently used */
/* bits 81:80 */
- unsigned int rsvd_4:7; /* must be zero */
+ unsigned int rsvd_4:7; /* must be zero */
/* bits 88:82 */
- unsigned int sw_ack_flag:1;/* software acknowledge flag */
+ unsigned int swack_flag:1; /* software acknowledge flag */
/* bit 89 */
- /* INTD trasactions at destination are to
- wait for software acknowledge */
- unsigned int rsvd_5:6; /* must be zero */
+ /* INTD trasactions at
+ destination are to wait for
+ software acknowledge */
+ unsigned int rsvd_5:6; /* must be zero */
/* bits 95:90 */
- unsigned int rsvd_6:5; /* must be zero */
+ unsigned int rsvd_6:5; /* must be zero */
/* bits 100:96 */
- unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */
+ unsigned int int_both:1; /* if 1, interrupt both sockets
+ on the uvhub */
/* bit 101*/
- unsigned int fairness:3;/* usually zero */
+ unsigned int fairness:3; /* usually zero */
/* bits 104:102 */
- unsigned int multilevel:1; /* multi-level multicast format */
+ unsigned int multilevel:1; /* multi-level multicast
+ format */
/* bit 105 */
- /* 0 for TLB: endpoint multi-unicast messages */
- unsigned int chaining:1;/* next descriptor is part of this activation*/
+ /* 0 for TLB: endpoint multi-unicast messages */
+ unsigned int chaining:1; /* next descriptor is part of
+ this activation*/
/* bit 106 */
- unsigned int rsvd_7:21; /* must be zero */
+ unsigned int rsvd_7:21; /* must be zero */
/* bits 127:107 */
};
-/* see msg_type: */
-#define MSG_NOOP 0
-#define MSG_REGULAR 1
-#define MSG_RETRY 2
-
/*
* The activation descriptor:
* The format of the message to send, plus all accompanying control
* Should be 64 bytes
*/
struct bau_desc {
- struct bau_target_uvhubmask distribution;
+ struct pnmask distribution;
/*
* message template, consisting of header and payload:
*/
- struct bau_msg_header header;
- struct bau_msg_payload payload;
+ struct bau_msg_header header;
+ struct bau_msg_payload payload;
};
/*
* -payload-- ---------header------
@@ -279,59 +346,51 @@ struct bau_desc {
* are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
* bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
* (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
- * sw_ack_vector and payload_2)
+ * swack_vec and payload_2)
* "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
* Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
* operation."
*/
-struct bau_payload_queue_entry {
- unsigned long address; /* signifies a page or all TLB's
- of the cpu */
+struct bau_pq_entry {
+ unsigned long address; /* signifies a page or all TLB's
+ of the cpu */
/* 64 bits, bytes 0-7 */
-
- unsigned short sending_cpu; /* cpu that sent the message */
+ unsigned short sending_cpu; /* cpu that sent the message */
/* 16 bits, bytes 8-9 */
-
- unsigned short acknowledge_count; /* filled in by destination */
+ unsigned short acknowledge_count; /* filled in by destination */
/* 16 bits, bytes 10-11 */
-
/* these next 3 bytes come from bits 58-81 of the message header */
- unsigned short replied_to:1; /* sent as 0 by the source */
- unsigned short msg_type:3; /* software message type */
- unsigned short canceled:1; /* sent as 0 by the source */
- unsigned short unused1:3; /* not currently using */
+ unsigned short replied_to:1; /* sent as 0 by the source */
+ unsigned short msg_type:3; /* software message type */
+ unsigned short canceled:1; /* sent as 0 by the source */
+ unsigned short unused1:3; /* not currently using */
/* byte 12 */
-
- unsigned char unused2a; /* not currently using */
+ unsigned char unused2a; /* not currently using */
/* byte 13 */
- unsigned char unused2; /* not currently using */
+ unsigned char unused2; /* not currently using */
/* byte 14 */
-
- unsigned char sw_ack_vector; /* filled in by the hardware */
+ unsigned char swack_vec; /* filled in by the hardware */
/* byte 15 (bits 127:120) */
-
- unsigned short sequence; /* message sequence number */
+ unsigned short sequence; /* message sequence number */
/* bytes 16-17 */
- unsigned char unused4[2]; /* not currently using bytes 18-19 */
+ unsigned char unused4[2]; /* not currently using bytes 18-19 */
/* bytes 18-19 */
-
- int number_of_cpus; /* filled in at destination */
+ int number_of_cpus; /* filled in at destination */
/* 32 bits, bytes 20-23 (aligned) */
-
- unsigned char unused5[8]; /* not using */
+ unsigned char unused5[8]; /* not using */
/* bytes 24-31 */
};
struct msg_desc {
- struct bau_payload_queue_entry *msg;
- int msg_slot;
- int sw_ack_slot;
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
+ struct bau_pq_entry *msg;
+ int msg_slot;
+ int swack_slot;
+ struct bau_pq_entry *queue_first;
+ struct bau_pq_entry *queue_last;
};
struct reset_args {
- int sender;
+ int sender;
};
/*
@@ -339,105 +398,227 @@ struct reset_args {
*/
struct ptc_stats {
/* sender statistics */
- unsigned long s_giveup; /* number of fall backs to IPI-style flushes */
- unsigned long s_requestor; /* number of shootdown requests */
- unsigned long s_stimeout; /* source side timeouts */
- unsigned long s_dtimeout; /* destination side timeouts */
- unsigned long s_time; /* time spent in sending side */
- unsigned long s_retriesok; /* successful retries */
- unsigned long s_ntargcpu; /* total number of cpu's targeted */
- unsigned long s_ntargself; /* times the sending cpu was targeted */
- unsigned long s_ntarglocals; /* targets of cpus on the local blade */
- unsigned long s_ntargremotes; /* targets of cpus on remote blades */
- unsigned long s_ntarglocaluvhub; /* targets of the local hub */
- unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
- unsigned long s_ntarguvhub; /* total number of uvhubs targeted */
- unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/
- unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */
- unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */
- unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */
- unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */
- unsigned long s_resets_plug; /* ipi-style resets from plug state */
- unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
- unsigned long s_busy; /* status stayed busy past s/w timer */
- unsigned long s_throttles; /* waits in throttle */
- unsigned long s_retry_messages; /* retry broadcasts */
- unsigned long s_bau_reenabled; /* for bau enable/disable */
- unsigned long s_bau_disabled; /* for bau enable/disable */
+ unsigned long s_giveup; /* number of fall backs to
+ IPI-style flushes */
+ unsigned long s_requestor; /* number of shootdown
+ requests */
+ unsigned long s_stimeout; /* source side timeouts */
+ unsigned long s_dtimeout; /* destination side timeouts */
+ unsigned long s_time; /* time spent in sending side */
+ unsigned long s_retriesok; /* successful retries */
+ unsigned long s_ntargcpu; /* total number of cpu's
+ targeted */
+ unsigned long s_ntargself; /* times the sending cpu was
+ targeted */
+ unsigned long s_ntarglocals; /* targets of cpus on the local
+ blade */
+ unsigned long s_ntargremotes; /* targets of cpus on remote
+ blades */
+ unsigned long s_ntarglocaluvhub; /* targets of the local hub */
+ unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
+ unsigned long s_ntarguvhub; /* total number of uvhubs
+ targeted */
+ unsigned long s_ntarguvhub16; /* number of times target
+ hubs >= 16*/
+ unsigned long s_ntarguvhub8; /* number of times target
+ hubs >= 8 */
+ unsigned long s_ntarguvhub4; /* number of times target
+ hubs >= 4 */
+ unsigned long s_ntarguvhub2; /* number of times target
+ hubs >= 2 */
+ unsigned long s_ntarguvhub1; /* number of times target
+ hubs == 1 */
+ unsigned long s_resets_plug; /* ipi-style resets from plug
+ state */
+ unsigned long s_resets_timeout; /* ipi-style resets from
+ timeouts */
+ unsigned long s_busy; /* status stayed busy past
+ s/w timer */
+ unsigned long s_throttles; /* waits in throttle */
+ unsigned long s_retry_messages; /* retry broadcasts */
+ unsigned long s_bau_reenabled; /* for bau enable/disable */
+ unsigned long s_bau_disabled; /* for bau enable/disable */
/* destination statistics */
- unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
- unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
- unsigned long d_multmsg; /* interrupts with multiple messages */
- unsigned long d_nomsg; /* interrupts with no message */
- unsigned long d_time; /* time spent on destination side */
- unsigned long d_requestee; /* number of messages processed */
- unsigned long d_retries; /* number of retry messages processed */
- unsigned long d_canceled; /* number of messages canceled by retries */
- unsigned long d_nocanceled; /* retries that found nothing to cancel */
- unsigned long d_resets; /* number of ipi-style requests processed */
- unsigned long d_rcanceled; /* number of messages canceled by resets */
+ unsigned long d_alltlb; /* times all tlb's on this
+ cpu were flushed */
+ unsigned long d_onetlb; /* times just one tlb on this
+ cpu was flushed */
+ unsigned long d_multmsg; /* interrupts with multiple
+ messages */
+ unsigned long d_nomsg; /* interrupts with no message */
+ unsigned long d_time; /* time spent on destination
+ side */
+ unsigned long d_requestee; /* number of messages
+ processed */
+ unsigned long d_retries; /* number of retry messages
+ processed */
+ unsigned long d_canceled; /* number of messages canceled
+ by retries */
+ unsigned long d_nocanceled; /* retries that found nothing
+ to cancel */
+ unsigned long d_resets; /* number of ipi-style requests
+ processed */
+ unsigned long d_rcanceled; /* number of messages canceled
+ by resets */
+};
+
+struct tunables {
+ int *tunp;
+ int deflt;
+};
+
+struct hub_and_pnode {
+ short uvhub;
+ short pnode;
+};
+
+struct socket_desc {
+ short num_cpus;
+ short cpu_number[MAX_CPUS_PER_SOCKET];
+};
+
+struct uvhub_desc {
+ unsigned short socket_mask;
+ short num_cpus;
+ short uvhub;
+ short pnode;
+ struct socket_desc socket[2];
};
/*
* one per-cpu; to locate the software tables
*/
struct bau_control {
- struct bau_desc *descriptor_base;
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
- struct bau_payload_queue_entry *bau_msg_head;
- struct bau_control *uvhub_master;
- struct bau_control *socket_master;
- struct ptc_stats *statp;
- unsigned long timeout_interval;
- unsigned long set_bau_on_time;
- atomic_t active_descriptor_count;
- int plugged_tries;
- int timeout_tries;
- int ipi_attempts;
- int conseccompletes;
- int baudisabled;
- int set_bau_off;
- short cpu;
- short uvhub_cpu;
- short uvhub;
- short cpus_in_socket;
- short cpus_in_uvhub;
- unsigned short message_number;
- unsigned short uvhub_quiesce;
- short socket_acknowledge_count[DEST_Q_SIZE];
- cycles_t send_message;
- spinlock_t uvhub_lock;
- spinlock_t queue_lock;
+ struct bau_desc *descriptor_base;
+ struct bau_pq_entry *queue_first;
+ struct bau_pq_entry *queue_last;
+ struct bau_pq_entry *bau_msg_head;
+ struct bau_control *uvhub_master;
+ struct bau_control *socket_master;
+ struct ptc_stats *statp;
+ cpumask_t *cpumask;
+ unsigned long timeout_interval;
+ unsigned long set_bau_on_time;
+ atomic_t active_descriptor_count;
+ int plugged_tries;
+ int timeout_tries;
+ int ipi_attempts;
+ int conseccompletes;
+ int baudisabled;
+ int set_bau_off;
+ short cpu;
+ short osnode;
+ short uvhub_cpu;
+ short uvhub;
+ short cpus_in_socket;
+ short cpus_in_uvhub;
+ short partition_base_pnode;
+ unsigned short message_number;
+ unsigned short uvhub_quiesce;
+ short socket_acknowledge_count[DEST_Q_SIZE];
+ cycles_t send_message;
+ spinlock_t uvhub_lock;
+ spinlock_t queue_lock;
/* tunables */
- int max_bau_concurrent;
- int max_bau_concurrent_constant;
- int plugged_delay;
- int plugsb4reset;
- int timeoutsb4reset;
- int ipi_reset_limit;
- int complete_threshold;
- int congested_response_us;
- int congested_reps;
- int congested_period;
- cycles_t period_time;
- long period_requests;
+ int max_concurr;
+ int max_concurr_const;
+ int plugged_delay;
+ int plugsb4reset;
+ int timeoutsb4reset;
+ int ipi_reset_limit;
+ int complete_threshold;
+ int cong_response_us;
+ int cong_reps;
+ int cong_period;
+ cycles_t period_time;
+ long period_requests;
+ struct hub_and_pnode *thp;
};
-static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
+static inline unsigned long read_mmr_uv2_status(void)
+{
+ return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
+}
+
+static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
+}
+
+static inline void write_mmr_descriptor_base(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image);
+}
+
+static inline void write_mmr_activation(unsigned long index)
+{
+ write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+}
+
+static inline void write_gmmr_activation(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
+}
+
+static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
+}
+
+static inline void write_mmr_payload_tail(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image);
+}
+
+static inline void write_mmr_payload_last(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image);
+}
+
+static inline void write_mmr_misc_control(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+}
+
+static inline unsigned long read_mmr_misc_control(int pnode)
+{
+ return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL);
+}
+
+static inline void write_mmr_sw_ack(unsigned long mr)
+{
+ uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
+static inline unsigned long read_mmr_sw_ack(void)
+{
+ return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+}
+
+static inline unsigned long read_gmmr_sw_ack(int pnode)
+{
+ return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+}
+
+static inline void write_mmr_data_config(int pnode, unsigned long mr)
+{
+ uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
+}
+
+static inline int bau_uvhub_isset(int uvhub, struct pnmask *dstp)
{
return constant_test_bit(uvhub, &dstp->bits[0]);
}
-static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp)
+static inline void bau_uvhub_set(int pnode, struct pnmask *dstp)
{
- __set_bit(uvhub, &dstp->bits[0]);
+ __set_bit(pnode, &dstp->bits[0]);
}
-static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
+static inline void bau_uvhubs_clear(struct pnmask *dstp,
int nbits)
{
bitmap_zero(&dstp->bits[0], nbits);
}
-static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp)
+static inline int bau_uvhub_weight(struct pnmask *dstp)
{
return bitmap_weight((unsigned long *)&dstp->bits[0],
UV_DISTRIBUTION_SIZE);
@@ -448,9 +629,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
bitmap_zero(&dstp->bits, nbits);
}
-#define cpubit_isset(cpu, bau_local_cpumask) \
- test_bit((cpu), (bau_local_cpumask).bits)
-
extern void uv_bau_message_intr1(void);
extern void uv_bau_timeout_intr1(void);
@@ -458,7 +636,7 @@ struct atomic_short {
short counter;
};
-/**
+/*
* atomic_read_short - read a short atomic variable
* @v: pointer of type atomic_short
*
@@ -469,14 +647,14 @@ static inline int atomic_read_short(const struct atomic_short *v)
return v->counter;
}
-/**
- * atomic_add_short_return - add and return a short int
+/*
+ * atom_asr - add and return a short int
* @i: short value to add
* @v: pointer of type atomic_short
*
* Atomically adds @i to @v and returns @i + @v
*/
-static inline int atomic_add_short_return(short i, struct atomic_short *v)
+static inline int atom_asr(short i, struct atomic_short *v)
{
short __i = i;
asm volatile(LOCK_PREFIX "xaddw %0, %1"
@@ -485,4 +663,26 @@ static inline int atomic_add_short_return(short i, struct atomic_short *v)
return i + __i;
}
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'. atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+ spin_lock(lock);
+ if (atomic_read(v) >= u) {
+ spin_unlock(lock);
+ return 0;
+ }
+ atomic_inc(v);
+ spin_unlock(lock);
+ return 1;
+}
+
#endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index a501741c233..f26544a1521 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -77,8 +77,9 @@
*
* 1111110000000000
* 5432109876543210
- * pppppppppplc0cch Nehalem-EX
- * ppppppppplcc0cch Westmere-EX
+ * pppppppppplc0cch Nehalem-EX (12 bits in hdw reg)
+ * ppppppppplcc0cch Westmere-EX (12 bits in hdw reg)
+ * pppppppppppcccch SandyBridge (15 bits in hdw reg)
* sssssssssss
*
* p = pnode bits
@@ -87,7 +88,7 @@
* h = hyperthread
* s = bits that are in the SOCKET_ID CSR
*
- * Note: Processor only supports 12 bits in the APICID register. The ACPI
+ * Note: Processor may support fewer bits in the APICID register. The ACPI
* tables hold all 16 bits. Software needs to be aware of this.
*
* Unless otherwise specified, all references to APICID refer to
@@ -138,6 +139,8 @@ struct uv_hub_info_s {
unsigned long global_mmr_base;
unsigned long gpa_mask;
unsigned int gnode_extra;
+ unsigned char hub_revision;
+ unsigned char apic_pnode_shift;
unsigned long gnode_upper;
unsigned long lowmem_remap_top;
unsigned long lowmem_remap_base;
@@ -149,13 +152,31 @@ struct uv_hub_info_s {
unsigned char m_val;
unsigned char n_val;
struct uv_scir_s scir;
- unsigned char apic_pnode_shift;
};
DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
+/*
+ * Hub revisions less than UV2_HUB_REVISION_BASE are UV1 hubs. All UV2
+ * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE.
+ * This is a software convention - NOT the hardware revision numbers in
+ * the hub chip.
+ */
+#define UV1_HUB_REVISION_BASE 1
+#define UV2_HUB_REVISION_BASE 3
+
+static inline int is_uv1_hub(void)
+{
+ return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE;
+}
+
+static inline int is_uv2_hub(void)
+{
+ return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE;
+}
+
union uvh_apicid {
unsigned long v;
struct uvh_apicid_s {
@@ -180,11 +201,25 @@ union uvh_apicid {
#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
-#define UV_LOCAL_MMR_BASE 0xf4000000UL
-#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
+#define UV1_LOCAL_MMR_BASE 0xf4000000UL
+#define UV1_GLOBAL_MMR32_BASE 0xf8000000UL
+#define UV1_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
+#define UV1_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
+
+#define UV2_LOCAL_MMR_BASE 0xfa000000UL
+#define UV2_GLOBAL_MMR32_BASE 0xfc000000UL
+#define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
+
+#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE \
+ : UV2_LOCAL_MMR_BASE)
+#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE \
+ : UV2_GLOBAL_MMR32_BASE)
+#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \
+ UV2_LOCAL_MMR_SIZE)
+#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\
+ UV2_GLOBAL_MMR32_SIZE)
#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
-#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
-#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
@@ -301,6 +336,17 @@ static inline int uv_apicid_to_pnode(int apicid)
}
/*
+ * Convert an apicid to the socket number on the blade
+ */
+static inline int uv_apicid_to_socket(int apicid)
+{
+ if (is_uv1_hub())
+ return (apicid >> (uv_hub_info->apic_pnode_shift - 1)) & 1;
+ else
+ return 0;
+}
+
+/*
* Access global MMRs using the low memory MMR32 space. This region supports
* faster MMR access but not all MMRs are accessible in this space.
*/
@@ -398,6 +444,8 @@ struct uv_blade_info {
unsigned short nr_online_cpus;
unsigned short pnode;
short memory_nid;
+ spinlock_t nmi_lock;
+ unsigned long nmi_count;
};
extern struct uv_blade_info *uv_blade_info;
extern short *uv_node_to_blade;
@@ -517,14 +565,13 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
/*
* Get the minimum revision number of the hub chips within the partition.
- * 1 - initial rev 1.0 silicon
- * 2 - rev 2.0 production silicon
+ * 1 - UV1 rev 1.0 initial silicon
+ * 2 - UV1 rev 2.0 production silicon
+ * 3 - UV2 rev 1.0 initial silicon
*/
static inline int uv_get_min_hub_revision_id(void)
{
- extern int uv_min_hub_revision_id;
-
- return uv_min_hub_revision_id;
+ return uv_hub_info->hub_revision;
}
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 20cafeac745..10474fb1185 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,1099 +5,2072 @@
*
* SGI UV MMR definitions
*
- * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_MMRS_H
#define _ASM_X86_UV_UV_MMRS_H
+/*
+ * This file contains MMR definitions for both UV1 & UV2 hubs.
+ *
+ * In general, MMR addresses and structures are identical on both hubs.
+ * These MMRs are identified as:
+ * #define UVH_xxx <address>
+ * union uvh_xxx {
+ * unsigned long v;
+ * struct uvh_int_cmpd_s {
+ * } s;
+ * };
+ *
+ * If the MMR exists on both hub type but has different addresses or
+ * contents, the MMR definition is similar to:
+ * #define UV1H_xxx <uv1 address>
+ * #define UV2H_xxx <uv2address>
+ * #define UVH_xxx (is_uv1_hub() ? UV1H_xxx : UV2H_xxx)
+ * union uvh_xxx {
+ * unsigned long v;
+ * struct uv1h_int_cmpd_s { (Common fields only)
+ * } s;
+ * struct uv1h_int_cmpd_s { (Full UV1 definition)
+ * } s1;
+ * struct uv2h_int_cmpd_s { (Full UV2 definition)
+ * } s2;
+ * };
+ *
+ * Only essential difference are enumerated. For example, if the address is
+ * the same for both UV1 & UV2, only a single #define is generated. Likewise,
+ * if the contents is the same for both hubs, only the "s" structure is
+ * generated.
+ *
+ * If the MMR exists on ONLY 1 type of hub, no generic definition is
+ * generated:
+ * #define UVnH_xxx <uvn address>
+ * union uvnh_xxx {
+ * unsigned long v;
+ * struct uvh_int_cmpd_s {
+ * } sn;
+ * };
+ */
+
#define UV_MMR_ENABLE (1UL << 63)
+#define UV1_HUB_PART_NUMBER 0x88a5
+#define UV2_HUB_PART_NUMBER 0x8eb8
+
+/* Compat: if this #define is present, UV headers support UV2 */
+#define UV2_HUB_IS_SUPPORTED 1
+
/* ========================================================================= */
/* UVH_BAU_DATA_BROADCAST */
/* ========================================================================= */
-#define UVH_BAU_DATA_BROADCAST 0x61688UL
-#define UVH_BAU_DATA_BROADCAST_32 0x0440
+#define UVH_BAU_DATA_BROADCAST 0x61688UL
+#define UVH_BAU_DATA_BROADCAST_32 0x440
-#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
-#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
+#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
+#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
union uvh_bau_data_broadcast_u {
- unsigned long v;
- struct uvh_bau_data_broadcast_s {
- unsigned long enable : 1; /* RW */
- unsigned long rsvd_1_63: 63; /* */
- } s;
+ unsigned long v;
+ struct uvh_bau_data_broadcast_s {
+ unsigned long enable:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s;
};
/* ========================================================================= */
/* UVH_BAU_DATA_CONFIG */
/* ========================================================================= */
-#define UVH_BAU_DATA_CONFIG 0x61680UL
-#define UVH_BAU_DATA_CONFIG_32 0x0438
-
-#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
-#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
-#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
-#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
-#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_BAU_DATA_CONFIG_P_SHFT 13
-#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_BAU_DATA_CONFIG_T_SHFT 15
-#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_BAU_DATA_CONFIG_M_SHFT 16
-#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
-#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_BAU_DATA_CONFIG 0x61680UL
+#define UVH_BAU_DATA_CONFIG_32 0x438
+
+#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
+#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
+#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
+#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
+#define UVH_BAU_DATA_CONFIG_P_SHFT 13
+#define UVH_BAU_DATA_CONFIG_T_SHFT 15
+#define UVH_BAU_DATA_CONFIG_M_SHFT 16
+#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
+#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_bau_data_config_u {
- unsigned long v;
- struct uvh_bau_data_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_bau_data_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_EVENT_OCCURRED0 */
/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0 0x70000UL
-#define UVH_EVENT_OCCURRED0_32 0x005e8
-
-#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
-#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
-#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
-#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
-#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
-#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
-#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
-#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
-#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
-#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
-#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
-#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
-#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
-#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+#define UVH_EVENT_OCCURRED0 0x70000UL
+#define UVH_EVENT_OCCURRED0_32 0x5e8
+
+#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
+#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
+#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
+#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
+#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51
+#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52
+#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53
+#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
+#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
+#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
+#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
+#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+
+#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
+#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2
+#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
+#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
+#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
+#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
+#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
+#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
+#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
+#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
+#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
+#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
+#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
+#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
+
union uvh_event_occurred0_u {
- unsigned long v;
- struct uvh_event_occurred0_s {
- unsigned long lb_hcerr : 1; /* RW, W1C */
- unsigned long gr0_hcerr : 1; /* RW, W1C */
- unsigned long gr1_hcerr : 1; /* RW, W1C */
- unsigned long lh_hcerr : 1; /* RW, W1C */
- unsigned long rh_hcerr : 1; /* RW, W1C */
- unsigned long xn_hcerr : 1; /* RW, W1C */
- unsigned long si_hcerr : 1; /* RW, W1C */
- unsigned long lb_aoerr0 : 1; /* RW, W1C */
- unsigned long gr0_aoerr0 : 1; /* RW, W1C */
- unsigned long gr1_aoerr0 : 1; /* RW, W1C */
- unsigned long lh_aoerr0 : 1; /* RW, W1C */
- unsigned long rh_aoerr0 : 1; /* RW, W1C */
- unsigned long xn_aoerr0 : 1; /* RW, W1C */
- unsigned long si_aoerr0 : 1; /* RW, W1C */
- unsigned long lb_aoerr1 : 1; /* RW, W1C */
- unsigned long gr0_aoerr1 : 1; /* RW, W1C */
- unsigned long gr1_aoerr1 : 1; /* RW, W1C */
- unsigned long lh_aoerr1 : 1; /* RW, W1C */
- unsigned long rh_aoerr1 : 1; /* RW, W1C */
- unsigned long xn_aoerr1 : 1; /* RW, W1C */
- unsigned long si_aoerr1 : 1; /* RW, W1C */
- unsigned long rh_vpi_int : 1; /* RW, W1C */
- unsigned long system_shutdown_int : 1; /* RW, W1C */
- unsigned long lb_irq_int_0 : 1; /* RW, W1C */
- unsigned long lb_irq_int_1 : 1; /* RW, W1C */
- unsigned long lb_irq_int_2 : 1; /* RW, W1C */
- unsigned long lb_irq_int_3 : 1; /* RW, W1C */
- unsigned long lb_irq_int_4 : 1; /* RW, W1C */
- unsigned long lb_irq_int_5 : 1; /* RW, W1C */
- unsigned long lb_irq_int_6 : 1; /* RW, W1C */
- unsigned long lb_irq_int_7 : 1; /* RW, W1C */
- unsigned long lb_irq_int_8 : 1; /* RW, W1C */
- unsigned long lb_irq_int_9 : 1; /* RW, W1C */
- unsigned long lb_irq_int_10 : 1; /* RW, W1C */
- unsigned long lb_irq_int_11 : 1; /* RW, W1C */
- unsigned long lb_irq_int_12 : 1; /* RW, W1C */
- unsigned long lb_irq_int_13 : 1; /* RW, W1C */
- unsigned long lb_irq_int_14 : 1; /* RW, W1C */
- unsigned long lb_irq_int_15 : 1; /* RW, W1C */
- unsigned long l1_nmi_int : 1; /* RW, W1C */
- unsigned long stop_clock : 1; /* RW, W1C */
- unsigned long asic_to_l1 : 1; /* RW, W1C */
- unsigned long l1_to_asic : 1; /* RW, W1C */
- unsigned long ltc_int : 1; /* RW, W1C */
- unsigned long la_seq_trigger : 1; /* RW, W1C */
- unsigned long ipi_int : 1; /* RW, W1C */
- unsigned long extio_int0 : 1; /* RW, W1C */
- unsigned long extio_int1 : 1; /* RW, W1C */
- unsigned long extio_int2 : 1; /* RW, W1C */
- unsigned long extio_int3 : 1; /* RW, W1C */
- unsigned long profile_int : 1; /* RW, W1C */
- unsigned long rtc0 : 1; /* RW, W1C */
- unsigned long rtc1 : 1; /* RW, W1C */
- unsigned long rtc2 : 1; /* RW, W1C */
- unsigned long rtc3 : 1; /* RW, W1C */
- unsigned long bau_data : 1; /* RW, W1C */
- unsigned long power_management_req : 1; /* RW, W1C */
- unsigned long rsvd_57_63 : 7; /* */
- } s;
+ unsigned long v;
+ struct uv1h_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW, W1C */
+ unsigned long gr0_hcerr:1; /* RW, W1C */
+ unsigned long gr1_hcerr:1; /* RW, W1C */
+ unsigned long lh_hcerr:1; /* RW, W1C */
+ unsigned long rh_hcerr:1; /* RW, W1C */
+ unsigned long xn_hcerr:1; /* RW, W1C */
+ unsigned long si_hcerr:1; /* RW, W1C */
+ unsigned long lb_aoerr0:1; /* RW, W1C */
+ unsigned long gr0_aoerr0:1; /* RW, W1C */
+ unsigned long gr1_aoerr0:1; /* RW, W1C */
+ unsigned long lh_aoerr0:1; /* RW, W1C */
+ unsigned long rh_aoerr0:1; /* RW, W1C */
+ unsigned long xn_aoerr0:1; /* RW, W1C */
+ unsigned long si_aoerr0:1; /* RW, W1C */
+ unsigned long lb_aoerr1:1; /* RW, W1C */
+ unsigned long gr0_aoerr1:1; /* RW, W1C */
+ unsigned long gr1_aoerr1:1; /* RW, W1C */
+ unsigned long lh_aoerr1:1; /* RW, W1C */
+ unsigned long rh_aoerr1:1; /* RW, W1C */
+ unsigned long xn_aoerr1:1; /* RW, W1C */
+ unsigned long si_aoerr1:1; /* RW, W1C */
+ unsigned long rh_vpi_int:1; /* RW, W1C */
+ unsigned long system_shutdown_int:1; /* RW, W1C */
+ unsigned long lb_irq_int_0:1; /* RW, W1C */
+ unsigned long lb_irq_int_1:1; /* RW, W1C */
+ unsigned long lb_irq_int_2:1; /* RW, W1C */
+ unsigned long lb_irq_int_3:1; /* RW, W1C */
+ unsigned long lb_irq_int_4:1; /* RW, W1C */
+ unsigned long lb_irq_int_5:1; /* RW, W1C */
+ unsigned long lb_irq_int_6:1; /* RW, W1C */
+ unsigned long lb_irq_int_7:1; /* RW, W1C */
+ unsigned long lb_irq_int_8:1; /* RW, W1C */
+ unsigned long lb_irq_int_9:1; /* RW, W1C */
+ unsigned long lb_irq_int_10:1; /* RW, W1C */
+ unsigned long lb_irq_int_11:1; /* RW, W1C */
+ unsigned long lb_irq_int_12:1; /* RW, W1C */
+ unsigned long lb_irq_int_13:1; /* RW, W1C */
+ unsigned long lb_irq_int_14:1; /* RW, W1C */
+ unsigned long lb_irq_int_15:1; /* RW, W1C */
+ unsigned long l1_nmi_int:1; /* RW, W1C */
+ unsigned long stop_clock:1; /* RW, W1C */
+ unsigned long asic_to_l1:1; /* RW, W1C */
+ unsigned long l1_to_asic:1; /* RW, W1C */
+ unsigned long ltc_int:1; /* RW, W1C */
+ unsigned long la_seq_trigger:1; /* RW, W1C */
+ unsigned long ipi_int:1; /* RW, W1C */
+ unsigned long extio_int0:1; /* RW, W1C */
+ unsigned long extio_int1:1; /* RW, W1C */
+ unsigned long extio_int2:1; /* RW, W1C */
+ unsigned long extio_int3:1; /* RW, W1C */
+ unsigned long profile_int:1; /* RW, W1C */
+ unsigned long rtc0:1; /* RW, W1C */
+ unsigned long rtc1:1; /* RW, W1C */
+ unsigned long rtc2:1; /* RW, W1C */
+ unsigned long rtc3:1; /* RW, W1C */
+ unsigned long bau_data:1; /* RW, W1C */
+ unsigned long power_management_req:1; /* RW, W1C */
+ unsigned long rsvd_57_63:7;
+ } s1;
+ struct uv2h_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long qp_hcerr:1; /* RW */
+ unsigned long rh_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long gr0_hcerr:1; /* RW */
+ unsigned long gr1_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long qp_aoerr0:1; /* RW */
+ unsigned long rh_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long gr0_aoerr0:1; /* RW */
+ unsigned long gr1_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rt_aoerr0:1; /* RW */
+ unsigned long ni0_aoerr0:1; /* RW */
+ unsigned long ni1_aoerr0:1; /* RW */
+ unsigned long lb_aoerr1:1; /* RW */
+ unsigned long qp_aoerr1:1; /* RW */
+ unsigned long rh_aoerr1:1; /* RW */
+ unsigned long lh0_aoerr1:1; /* RW */
+ unsigned long lh1_aoerr1:1; /* RW */
+ unsigned long gr0_aoerr1:1; /* RW */
+ unsigned long gr1_aoerr1:1; /* RW */
+ unsigned long xb_aoerr1:1; /* RW */
+ unsigned long rt_aoerr1:1; /* RW */
+ unsigned long ni0_aoerr1:1; /* RW */
+ unsigned long ni1_aoerr1:1; /* RW */
+ unsigned long system_shutdown_int:1; /* RW */
+ unsigned long lb_irq_int_0:1; /* RW */
+ unsigned long lb_irq_int_1:1; /* RW */
+ unsigned long lb_irq_int_2:1; /* RW */
+ unsigned long lb_irq_int_3:1; /* RW */
+ unsigned long lb_irq_int_4:1; /* RW */
+ unsigned long lb_irq_int_5:1; /* RW */
+ unsigned long lb_irq_int_6:1; /* RW */
+ unsigned long lb_irq_int_7:1; /* RW */
+ unsigned long lb_irq_int_8:1; /* RW */
+ unsigned long lb_irq_int_9:1; /* RW */
+ unsigned long lb_irq_int_10:1; /* RW */
+ unsigned long lb_irq_int_11:1; /* RW */
+ unsigned long lb_irq_int_12:1; /* RW */
+ unsigned long lb_irq_int_13:1; /* RW */
+ unsigned long lb_irq_int_14:1; /* RW */
+ unsigned long lb_irq_int_15:1; /* RW */
+ unsigned long l1_nmi_int:1; /* RW */
+ unsigned long stop_clock:1; /* RW */
+ unsigned long asic_to_l1:1; /* RW */
+ unsigned long l1_to_asic:1; /* RW */
+ unsigned long la_seq_trigger:1; /* RW */
+ unsigned long ipi_int:1; /* RW */
+ unsigned long extio_int0:1; /* RW */
+ unsigned long extio_int1:1; /* RW */
+ unsigned long extio_int2:1; /* RW */
+ unsigned long extio_int3:1; /* RW */
+ unsigned long profile_int:1; /* RW */
+ unsigned long rsvd_59_63:5;
+ } s2;
};
/* ========================================================================= */
/* UVH_EVENT_OCCURRED0_ALIAS */
/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
-#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
+#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
+#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
/* ========================================================================= */
/* UVH_GR0_TLB_INT0_CONFIG */
/* ========================================================================= */
-#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
-
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
+
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
+#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
+#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_gr0_tlb_int0_config_u {
- unsigned long v;
- struct uvh_gr0_tlb_int0_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_gr0_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_GR0_TLB_INT1_CONFIG */
/* ========================================================================= */
-#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
-
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
+
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
+#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
+#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_gr0_tlb_int1_config_u {
- unsigned long v;
- struct uvh_gr0_tlb_int1_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_gr0_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_MMR_CONTROL */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL
+#define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL
+#define UVH_GR0_TLB_MMR_CONTROL (is_uv1_hub() ? \
+ UV1H_GR0_TLB_MMR_CONTROL : \
+ UV2H_GR0_TLB_MMR_CONTROL)
+
+#define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+
+#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60
+#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
+
+#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+
+union uvh_gr0_tlb_mmr_control_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_63:32;
+ } s;
+ struct uv1h_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_47:16;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53:1;
+ unsigned long mmr_inj_tlbpgsize:1; /* RW */
+ unsigned long rsvd_55:1;
+ unsigned long mmr_inj_tlbrreg:1; /* RW */
+ unsigned long rsvd_57_59:3;
+ unsigned long mmr_inj_tlblruv:1; /* RW */
+ unsigned long rsvd_61_63:3;
+ } s1;
+ struct uv2h_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53_63:11;
+ } s2;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_MMR_READ_DATA_HI */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \
+ UV1H_GR0_TLB_MMR_READ_DATA_HI : \
+ UV2H_GR0_TLB_MMR_READ_DATA_HI)
+
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+union uvh_gr0_tlb_mmr_read_data_hi_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_MMR_READ_DATA_LO */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \
+ UV1H_GR0_TLB_MMR_READ_DATA_LO : \
+ UV2H_GR0_TLB_MMR_READ_DATA_LO)
+
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+union uvh_gr0_tlb_mmr_read_data_lo_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s;
};
/* ========================================================================= */
/* UVH_GR1_TLB_INT0_CONFIG */
/* ========================================================================= */
-#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
-
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
+
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
+#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
+#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_gr1_tlb_int0_config_u {
- unsigned long v;
- struct uvh_gr1_tlb_int0_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_gr1_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_GR1_TLB_INT1_CONFIG */
/* ========================================================================= */
-#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
-
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
+
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
+#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
+#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_gr1_tlb_int1_config_u {
- unsigned long v;
- struct uvh_gr1_tlb_int1_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_gr1_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_MMR_CONTROL */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL
+#define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL
+#define UVH_GR1_TLB_MMR_CONTROL (is_uv1_hub() ? \
+ UV1H_GR1_TLB_MMR_CONTROL : \
+ UV2H_GR1_TLB_MMR_CONTROL)
+
+#define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+
+#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60
+#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
+
+#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+
+union uvh_gr1_tlb_mmr_control_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_63:32;
+ } s;
+ struct uv1h_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_47:16;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53:1;
+ unsigned long mmr_inj_tlbpgsize:1; /* RW */
+ unsigned long rsvd_55:1;
+ unsigned long mmr_inj_tlbrreg:1; /* RW */
+ unsigned long rsvd_57_59:3;
+ unsigned long mmr_inj_tlblruv:1; /* RW */
+ unsigned long rsvd_61_63:3;
+ } s1;
+ struct uv2h_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53_63:11;
+ } s2;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_MMR_READ_DATA_HI */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI (is_uv1_hub() ? \
+ UV1H_GR1_TLB_MMR_READ_DATA_HI : \
+ UV2H_GR1_TLB_MMR_READ_DATA_HI)
+
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+union uvh_gr1_tlb_mmr_read_data_hi_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_MMR_READ_DATA_LO */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO (is_uv1_hub() ? \
+ UV1H_GR1_TLB_MMR_READ_DATA_LO : \
+ UV2H_GR1_TLB_MMR_READ_DATA_LO)
+
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+union uvh_gr1_tlb_mmr_read_data_lo_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s;
};
/* ========================================================================= */
/* UVH_INT_CMPB */
/* ========================================================================= */
-#define UVH_INT_CMPB 0x22080UL
+#define UVH_INT_CMPB 0x22080UL
-#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
-#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
+#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
+#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
union uvh_int_cmpb_u {
- unsigned long v;
- struct uvh_int_cmpb_s {
- unsigned long real_time_cmpb : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+ unsigned long v;
+ struct uvh_int_cmpb_s {
+ unsigned long real_time_cmpb:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
};
/* ========================================================================= */
/* UVH_INT_CMPC */
/* ========================================================================= */
-#define UVH_INT_CMPC 0x22100UL
+#define UVH_INT_CMPC 0x22100UL
-#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
-#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
+#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
+#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
union uvh_int_cmpc_u {
- unsigned long v;
- struct uvh_int_cmpc_s {
- unsigned long real_time_cmpc : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+ unsigned long v;
+ struct uvh_int_cmpc_s {
+ unsigned long real_time_cmpc:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
};
/* ========================================================================= */
/* UVH_INT_CMPD */
/* ========================================================================= */
-#define UVH_INT_CMPD 0x22180UL
+#define UVH_INT_CMPD 0x22180UL
-#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
-#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
+#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
+#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
union uvh_int_cmpd_u {
- unsigned long v;
- struct uvh_int_cmpd_s {
- unsigned long real_time_cmpd : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+ unsigned long v;
+ struct uvh_int_cmpd_s {
+ unsigned long real_time_cmpd:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
};
/* ========================================================================= */
/* UVH_IPI_INT */
/* ========================================================================= */
-#define UVH_IPI_INT 0x60500UL
-#define UVH_IPI_INT_32 0x0348
+#define UVH_IPI_INT 0x60500UL
+#define UVH_IPI_INT_32 0x348
-#define UVH_IPI_INT_VECTOR_SHFT 0
-#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
-#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
-#define UVH_IPI_INT_DESTMODE_SHFT 11
-#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_IPI_INT_APIC_ID_SHFT 16
-#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
-#define UVH_IPI_INT_SEND_SHFT 63
-#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
+#define UVH_IPI_INT_VECTOR_SHFT 0
+#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
+#define UVH_IPI_INT_DESTMODE_SHFT 11
+#define UVH_IPI_INT_APIC_ID_SHFT 16
+#define UVH_IPI_INT_SEND_SHFT 63
+#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
+#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
+#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
union uvh_ipi_int_u {
- unsigned long v;
- struct uvh_ipi_int_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long delivery_mode : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long rsvd_12_15 : 4; /* */
- unsigned long apic_id : 32; /* RW */
- unsigned long rsvd_48_62 : 15; /* */
- unsigned long send : 1; /* WP */
- } s;
+ unsigned long v;
+ struct uvh_ipi_int_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long delivery_mode:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long rsvd_12_15:4;
+ unsigned long apic_id:32; /* RW */
+ unsigned long rsvd_48_62:15;
+ unsigned long send:1; /* WP */
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
union uvh_lb_bau_intd_payload_queue_first_u {
- unsigned long v;
- struct uvh_lb_bau_intd_payload_queue_first_s {
- unsigned long rsvd_0_3: 4; /* */
- unsigned long address : 39; /* RW */
- unsigned long rsvd_43_48: 6; /* */
- unsigned long node_id : 14; /* RW */
- unsigned long rsvd_63 : 1; /* */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_first_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_48:6;
+ unsigned long node_id:14; /* RW */
+ unsigned long rsvd_63:1;
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
union uvh_lb_bau_intd_payload_queue_last_u {
- unsigned long v;
- struct uvh_lb_bau_intd_payload_queue_last_s {
- unsigned long rsvd_0_3: 4; /* */
- unsigned long address : 39; /* RW */
- unsigned long rsvd_43_63: 21; /* */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_last_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
union uvh_lb_bau_intd_payload_queue_tail_u {
- unsigned long v;
- struct uvh_lb_bau_intd_payload_queue_tail_s {
- unsigned long rsvd_0_3: 4; /* */
- unsigned long address : 39; /* RW */
- unsigned long rsvd_43_63: 21; /* */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_intd_payload_queue_tail_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
+
union uvh_lb_bau_intd_software_acknowledge_u {
- unsigned long v;
- struct uvh_lb_bau_intd_software_acknowledge_s {
- unsigned long pending_0 : 1; /* RW, W1C */
- unsigned long pending_1 : 1; /* RW, W1C */
- unsigned long pending_2 : 1; /* RW, W1C */
- unsigned long pending_3 : 1; /* RW, W1C */
- unsigned long pending_4 : 1; /* RW, W1C */
- unsigned long pending_5 : 1; /* RW, W1C */
- unsigned long pending_6 : 1; /* RW, W1C */
- unsigned long pending_7 : 1; /* RW, W1C */
- unsigned long timeout_0 : 1; /* RW, W1C */
- unsigned long timeout_1 : 1; /* RW, W1C */
- unsigned long timeout_2 : 1; /* RW, W1C */
- unsigned long timeout_3 : 1; /* RW, W1C */
- unsigned long timeout_4 : 1; /* RW, W1C */
- unsigned long timeout_5 : 1; /* RW, W1C */
- unsigned long timeout_6 : 1; /* RW, W1C */
- unsigned long timeout_7 : 1; /* RW, W1C */
- unsigned long rsvd_16_63: 48; /* */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_intd_software_acknowledge_s {
+ unsigned long pending_0:1; /* RW, W1C */
+ unsigned long pending_1:1; /* RW, W1C */
+ unsigned long pending_2:1; /* RW, W1C */
+ unsigned long pending_3:1; /* RW, W1C */
+ unsigned long pending_4:1; /* RW, W1C */
+ unsigned long pending_5:1; /* RW, W1C */
+ unsigned long pending_6:1; /* RW, W1C */
+ unsigned long pending_7:1; /* RW, W1C */
+ unsigned long timeout_0:1; /* RW, W1C */
+ unsigned long timeout_1:1; /* RW, W1C */
+ unsigned long timeout_2:1; /* RW, W1C */
+ unsigned long timeout_3:1; /* RW, W1C */
+ unsigned long timeout_4:1; /* RW, W1C */
+ unsigned long timeout_5:1; /* RW, W1C */
+ unsigned long timeout_6:1; /* RW, W1C */
+ unsigned long timeout_7:1; /* RW, W1C */
+ unsigned long rsvd_16_63:48;
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
/* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
/* ========================================================================= */
/* UVH_LB_BAU_MISC_CONTROL */
/* ========================================================================= */
-#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
-#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10
-
-#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
-#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
-#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
-#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
-#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11
-#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
+#define UVH_LB_BAU_MISC_CONTROL_32 0xa10
+
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
-#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
-#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
-#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
-#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
-#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
-#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
-#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
-#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
-#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
-#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
-#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
+#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
union uvh_lb_bau_misc_control_u {
- unsigned long v;
- struct uvh_lb_bau_misc_control_s {
- unsigned long rejection_delay : 8; /* RW */
- unsigned long apic_mode : 1; /* RW */
- unsigned long force_broadcast : 1; /* RW */
- unsigned long force_lock_nop : 1; /* RW */
- unsigned long csi_agent_presence_vector : 3; /* RW */
- unsigned long descriptor_fetch_mode : 1; /* RW */
- unsigned long enable_intd_soft_ack_mode : 1; /* RW */
- unsigned long intd_soft_ack_timeout_period : 4; /* RW */
- unsigned long enable_dual_mapping_mode : 1; /* RW */
- unsigned long vga_io_port_decode_enable : 1; /* RW */
- unsigned long vga_io_port_16_bit_decode : 1; /* RW */
- unsigned long suppress_dest_registration : 1; /* RW */
- unsigned long programmed_initial_priority : 3; /* RW */
- unsigned long use_incoming_priority : 1; /* RW */
- unsigned long enable_programmed_initial_priority : 1; /* RW */
- unsigned long rsvd_29_47 : 19; /* */
- unsigned long fun : 16; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long rsvd_29_63:35;
+ } s;
+ struct uv1h_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long rsvd_29_47:19;
+ unsigned long fun:16; /* RW */
+ } s1;
+ struct uv2h_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long enable_automatic_apic_mode_selection:1;/* RW */
+ unsigned long apic_mode_status:1; /* RO */
+ unsigned long suppress_interrupts_to_self:1; /* RW */
+ unsigned long enable_lock_based_system_flush:1;/* RW */
+ unsigned long enable_extended_sb_status:1; /* RW */
+ unsigned long suppress_int_prio_udt_to_self:1;/* RW */
+ unsigned long use_legacy_descriptor_formats:1;/* RW */
+ unsigned long rsvd_36_47:12;
+ unsigned long fun:16; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
union uvh_lb_bau_sb_activation_control_u {
- unsigned long v;
- struct uvh_lb_bau_sb_activation_control_s {
- unsigned long index : 6; /* RW */
- unsigned long rsvd_6_61: 56; /* */
- unsigned long push : 1; /* WP */
- unsigned long init : 1; /* WP */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_control_s {
+ unsigned long index:6; /* RW */
+ unsigned long rsvd_6_61:56;
+ unsigned long push:1; /* WP */
+ unsigned long init:1; /* WP */
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
union uvh_lb_bau_sb_activation_status_0_u {
- unsigned long v;
- struct uvh_lb_bau_sb_activation_status_0_s {
- unsigned long status : 64; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_0_s {
+ unsigned long status:64; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
union uvh_lb_bau_sb_activation_status_1_u {
- unsigned long v;
- struct uvh_lb_bau_sb_activation_status_1_s {
- unsigned long status : 64; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_1_s {
+ unsigned long status:64; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
/* ========================================================================= */
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
union uvh_lb_bau_sb_descriptor_base_u {
- unsigned long v;
- struct uvh_lb_bau_sb_descriptor_base_s {
- unsigned long rsvd_0_11 : 12; /* */
- unsigned long page_address : 31; /* RW */
- unsigned long rsvd_43_48 : 6; /* */
- unsigned long node_id : 14; /* RW */
- unsigned long rsvd_63 : 1; /* */
- } s;
-};
-
-/* ========================================================================= */
-/* UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK */
-/* ========================================================================= */
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0
-
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
-#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
-
-union uvh_lb_target_physical_apic_id_mask_u {
- unsigned long v;
- struct uvh_lb_target_physical_apic_id_mask_s {
- unsigned long bit_enables : 32; /* RW */
- unsigned long rsvd_32_63 : 32; /* */
+ unsigned long v;
+ struct uvh_lb_bau_sb_descriptor_base_s {
+ unsigned long rsvd_0_11:12;
+ unsigned long page_address:31; /* RW */
+ unsigned long rsvd_43_48:6;
+ unsigned long node_id:14; /* RW */
+ unsigned long rsvd_63:1;
} s;
};
/* ========================================================================= */
/* UVH_NODE_ID */
/* ========================================================================= */
-#define UVH_NODE_ID 0x0UL
-
-#define UVH_NODE_ID_FORCE1_SHFT 0
-#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
-#define UVH_NODE_ID_MANUFACTURER_SHFT 1
-#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
-#define UVH_NODE_ID_PART_NUMBER_SHFT 12
-#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
-#define UVH_NODE_ID_REVISION_SHFT 28
-#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
-#define UVH_NODE_ID_NODE_ID_SHFT 32
-#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
-#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
-#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
-#define UVH_NODE_ID_NI_PORT_SHFT 56
-#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+#define UVH_NODE_ID 0x0UL
+
+#define UVH_NODE_ID_FORCE1_SHFT 0
+#define UVH_NODE_ID_MANUFACTURER_SHFT 1
+#define UVH_NODE_ID_PART_NUMBER_SHFT 12
+#define UVH_NODE_ID_REVISION_SHFT 28
+#define UVH_NODE_ID_NODE_ID_SHFT 32
+#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+
+#define UV1H_NODE_ID_FORCE1_SHFT 0
+#define UV1H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV1H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV1H_NODE_ID_REVISION_SHFT 28
+#define UV1H_NODE_ID_NODE_ID_SHFT 32
+#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48
+#define UV1H_NODE_ID_NI_PORT_SHFT 56
+#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
+#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+
+#define UV2H_NODE_ID_FORCE1_SHFT 0
+#define UV2H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV2H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV2H_NODE_ID_REVISION_SHFT 28
+#define UV2H_NODE_ID_NODE_ID_SHFT 32
+#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UV2H_NODE_ID_NI_PORT_SHFT 57
+#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
union uvh_node_id_u {
- unsigned long v;
- struct uvh_node_id_s {
- unsigned long force1 : 1; /* RO */
- unsigned long manufacturer : 11; /* RO */
- unsigned long part_number : 16; /* RO */
- unsigned long revision : 4; /* RO */
- unsigned long node_id : 15; /* RW */
- unsigned long rsvd_47 : 1; /* */
- unsigned long nodes_per_bit : 7; /* RW */
- unsigned long rsvd_55 : 1; /* */
- unsigned long ni_port : 4; /* RO */
- unsigned long rsvd_60_63 : 4; /* */
- } s;
+ unsigned long v;
+ struct uvh_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_63:17;
+ } s;
+ struct uv1h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47:1;
+ unsigned long nodes_per_bit:7; /* RW */
+ unsigned long rsvd_55:1;
+ unsigned long ni_port:4; /* RO */
+ unsigned long rsvd_60_63:4;
+ } s1;
+ struct uv2h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_49:3;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } s2;
};
/* ========================================================================= */
/* UVH_NODE_PRESENT_TABLE */
/* ========================================================================= */
-#define UVH_NODE_PRESENT_TABLE 0x1400UL
-#define UVH_NODE_PRESENT_TABLE_DEPTH 16
+#define UVH_NODE_PRESENT_TABLE 0x1400UL
+#define UVH_NODE_PRESENT_TABLE_DEPTH 16
-#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
-#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
+#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
+#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
union uvh_node_present_table_u {
- unsigned long v;
- struct uvh_node_present_table_s {
- unsigned long nodes : 64; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_node_present_table_s {
+ unsigned long nodes:64; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
- unsigned long rsvd_0_23 : 24; /* */
- unsigned long dest_base : 22; /* RW */
- unsigned long rsvd_46_63: 18; /* */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
- unsigned long rsvd_0_23 : 24; /* */
- unsigned long dest_base : 22; /* RW */
- unsigned long rsvd_46_63: 18; /* */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
- unsigned long rsvd_0_23 : 24; /* */
- unsigned long dest_base : 22; /* RW */
- unsigned long rsvd_46_63: 18; /* */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
};
/* ========================================================================= */
/* UVH_RH_GAM_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
-#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
-#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
-#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
-#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
-#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
-#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
union uvh_rh_gam_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_config_mmr_s {
- unsigned long m_skt : 6; /* RW */
- unsigned long n_skt : 4; /* RW */
- unsigned long rsvd_10_11: 2; /* */
- unsigned long mmiol_cfg : 1; /* RW */
- unsigned long rsvd_13_63: 51; /* */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s;
+ struct uv1h_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_11:2;
+ unsigned long mmiol_cfg:1; /* RW */
+ unsigned long rsvd_13_63:51;
+ } s1;
+ struct uv2h_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s2;
};
/* ========================================================================= */
/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_gru_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_gru_overlay_config_mmr_s {
- unsigned long rsvd_0_27: 28; /* */
- unsigned long base : 18; /* RW */
- unsigned long rsvd_46_47: 2; /* */
- unsigned long gr4 : 1; /* RW */
- unsigned long rsvd_49_51: 3; /* */
- unsigned long n_gru : 4; /* RW */
- unsigned long rsvd_56_62: 7; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s;
+ struct uv1h_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_47:2;
+ unsigned long gr4:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uv2h_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
-
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_mmioh_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_mmioh_overlay_config_mmr_s {
- unsigned long rsvd_0_29: 30; /* */
- unsigned long base : 16; /* RW */
- unsigned long m_io : 6; /* RW */
- unsigned long n_io : 4; /* RW */
- unsigned long rsvd_56_62: 7; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uv1h_rh_gam_mmioh_overlay_config_mmr_s {
+ unsigned long rsvd_0_29:30;
+ unsigned long base:16; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uv2h_rh_gam_mmioh_overlay_config_mmr_s {
+ unsigned long rsvd_0_26:27;
+ unsigned long base:19; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
union uvh_rh_gam_mmr_overlay_config_mmr_u {
- unsigned long v;
- struct uvh_rh_gam_mmr_overlay_config_mmr_s {
- unsigned long rsvd_0_25: 26; /* */
- unsigned long base : 20; /* RW */
- unsigned long dual_hub : 1; /* RW */
- unsigned long rsvd_47_62: 16; /* */
- unsigned long enable : 1; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s;
+ struct uv1h_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long dual_hub:1; /* RW */
+ unsigned long rsvd_47_62:16;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s2;
};
/* ========================================================================= */
/* UVH_RTC */
/* ========================================================================= */
-#define UVH_RTC 0x340000UL
+#define UVH_RTC 0x340000UL
-#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
-#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
+#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
+#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
union uvh_rtc_u {
- unsigned long v;
- struct uvh_rtc_s {
- unsigned long real_time_clock : 56; /* RW */
- unsigned long rsvd_56_63 : 8; /* */
- } s;
+ unsigned long v;
+ struct uvh_rtc_s {
+ unsigned long real_time_clock:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
};
/* ========================================================================= */
/* UVH_RTC1_INT_CONFIG */
/* ========================================================================= */
-#define UVH_RTC1_INT_CONFIG 0x615c0UL
-
-#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC1_INT_CONFIG_P_SHFT 13
-#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC1_INT_CONFIG_T_SHFT 15
-#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC1_INT_CONFIG_M_SHFT 16
-#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_RTC1_INT_CONFIG 0x615c0UL
+
+#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC1_INT_CONFIG_P_SHFT 13
+#define UVH_RTC1_INT_CONFIG_T_SHFT 15
+#define UVH_RTC1_INT_CONFIG_M_SHFT 16
+#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
union uvh_rtc1_int_config_u {
- unsigned long v;
- struct uvh_rtc1_int_config_s {
- unsigned long vector_ : 8; /* RW */
- unsigned long dm : 3; /* RW */
- unsigned long destmode : 1; /* RW */
- unsigned long status : 1; /* RO */
- unsigned long p : 1; /* RO */
- unsigned long rsvd_14 : 1; /* */
- unsigned long t : 1; /* RO */
- unsigned long m : 1; /* RW */
- unsigned long rsvd_17_31: 15; /* */
- unsigned long apic_id : 32; /* RW */
- } s;
+ unsigned long v;
+ struct uvh_rtc1_int_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SCRATCH5 */
+/* ========================================================================= */
+#define UVH_SCRATCH5 0x2d0200UL
+#define UVH_SCRATCH5_32 0x778
+
+#define UVH_SCRATCH5_SCRATCH5_SHFT 0
+#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+
+union uvh_scratch5_u {
+ unsigned long v;
+ struct uvh_scratch5_s {
+ unsigned long scratch5:64; /* RW, W1CS */
+ } s;
+};
+
+/* ========================================================================= */
+/* UV2H_EVENT_OCCURRED2 */
+/* ========================================================================= */
+#define UV2H_EVENT_OCCURRED2 0x70100UL
+#define UV2H_EVENT_OCCURRED2_32 0xb68
+
+#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
+#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
+#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
+#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
+#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
+#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
+#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
+#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
+#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
+#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
+#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
+#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
+#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
+#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
+#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
+#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
+#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
+#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
+#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
+#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
+#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
+#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
+#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
+#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
+#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
+#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
+#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
+#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
+#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
+#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
+#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
+#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
+#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
+
+union uv2h_event_occurred2_u {
+ unsigned long v;
+ struct uv2h_event_occurred2_s {
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_32_63:32;
+ } s1;
+};
+
+/* ========================================================================= */
+/* UV2H_EVENT_OCCURRED2_ALIAS */
+/* ========================================================================= */
+#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL
+#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70
+
+/* ========================================================================= */
+/* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */
+/* ========================================================================= */
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
+
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
+
+union uv2h_lb_bau_sb_activation_status_2_u {
+ unsigned long v;
+ struct uv2h_lb_bau_sb_activation_status_2_s {
+ unsigned long aux_error:64; /* RW */
+ } s1;
+};
+
+/* ========================================================================= */
+/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */
+/* ========================================================================= */
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0
+
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
+
+union uv1h_lb_target_physical_apic_id_mask_u {
+ unsigned long v;
+ struct uv1h_lb_target_physical_apic_id_mask_s {
+ unsigned long bit_enables:32; /* RW */
+ unsigned long rsvd_32_63:32;
+ } s1;
};
-#endif /* __ASM_UV_MMRS_X86_H__ */
+#endif /* _ASM_X86_UV_UV_MMRS_H */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9064052b73d..bb0522850b7 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,20 +1,6 @@
#ifndef _ASM_X86_VDSO_H
#define _ASM_X86_VDSO_H
-#ifdef CONFIG_X86_64
-extern const char VDSO64_PRELINK[];
-
-/*
- * Given a pointer to the vDSO image, find the pointer to VDSO64_name
- * as that symbol is defined in the vDSO sources or linker script.
- */
-#define VDSO64_SYMBOL(base, name) \
-({ \
- extern const char VDSO64_##name[]; \
- (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
-})
-#endif
-
#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
extern const char VDSO32_PRELINK[];
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e204826..815285bcace 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -11,10 +11,9 @@ struct vsyscall_gtod_data {
time_t wall_time_sec;
u32 wall_time_nsec;
- int sysctl_enabled;
struct timezone sys_tz;
struct { /* extract of a clocksource struct */
- cycle_t (*vread)(void);
+ int vclock_mode;
cycle_t cycle_last;
cycle_t mask;
u32 mult;
@@ -23,8 +22,6 @@ struct vsyscall_gtod_data {
struct timespec wall_to_monotonic;
struct timespec wall_time_coarse;
};
-extern struct vsyscall_gtod_data __vsyscall_gtod_data
-__section_vsyscall_gtod_data;
extern struct vsyscall_gtod_data vsyscall_gtod_data;
#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 84471b81046..2caf290e989 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -132,6 +132,8 @@ enum vmcs_field {
GUEST_IA32_PAT_HIGH = 0x00002805,
GUEST_IA32_EFER = 0x00002806,
GUEST_IA32_EFER_HIGH = 0x00002807,
+ GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
GUEST_PDPTR0 = 0x0000280a,
GUEST_PDPTR0_HIGH = 0x0000280b,
GUEST_PDPTR1 = 0x0000280c,
@@ -144,6 +146,8 @@ enum vmcs_field {
HOST_IA32_PAT_HIGH = 0x00002c01,
HOST_IA32_EFER = 0x00002c02,
HOST_IA32_EFER_HIGH = 0x00002c03,
+ HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
EXCEPTION_BITMAP = 0x00004004,
@@ -426,4 +430,43 @@ struct vmx_msr_entry {
u64 value;
} __aligned(16);
+/*
+ * Exit Qualifications for entry failure during or after loading guest state
+ */
+#define ENTRY_FAIL_DEFAULT 0
+#define ENTRY_FAIL_PDPTE 2
+#define ENTRY_FAIL_NMI 3
+#define ENTRY_FAIL_VMCS_LINK_PTR 4
+
+/*
+ * VM-instruction error numbers
+ */
+enum vm_instruction_error_number {
+ VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
+ VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
+ VMXERR_VMCLEAR_VMXON_POINTER = 3,
+ VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
+ VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
+ VMXERR_VMRESUME_AFTER_VMXOFF = 6,
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
+ VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
+ VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
+ VMXERR_VMPTRLD_VMXON_POINTER = 10,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
+ VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
+ VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
+ VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
+ VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
+ VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
+ VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
+ VMXERR_VMCALL_NONCLEAR_VMCS = 19,
+ VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
+ VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
+ VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
+ VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
+ VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
+ VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
+};
+
#endif
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d255fb..60107072c28 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,27 +16,15 @@ enum vsyscall_num {
#ifdef __KERNEL__
#include <linux/seqlock.h>
-#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
-#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
-
-/* Definitions for CONFIG_GENERIC_TIME definitions */
-#define __section_vsyscall_gtod_data __attribute__ \
- ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
-#define __section_vsyscall_clock __attribute__ \
- ((unused, __section__ (".vsyscall_clock"),aligned(16)))
-#define __vsyscall_fn \
- __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
-
#define VGETCPU_RDTSCP 1
#define VGETCPU_LSL 2
-extern int __vgetcpu_mode;
-extern volatile unsigned long __jiffies;
-
/* kernel space (writeable) */
extern int vgetcpu_mode;
extern struct timezone sys_tz;
+#include <asm/vvar.h>
+
extern void map_vsyscall(void);
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
new file mode 100644
index 00000000000..de656ac2af4
--- /dev/null
+++ b/arch/x86/include/asm/vvar.h
@@ -0,0 +1,50 @@
+/*
+ * vvar.h: Shared vDSO/kernel variable declarations
+ * Copyright (c) 2011 Andy Lutomirski
+ * Subject to the GNU General Public License, version 2
+ *
+ * A handful of variables are accessible (read-only) from userspace
+ * code in the vsyscall page and the vdso. They are declared here.
+ * Some other file must define them with DEFINE_VVAR.
+ *
+ * In normal kernel code, they are used like any other variable.
+ * In user code, they are accessed through the VVAR macro.
+ *
+ * These variables live in a page of kernel data that has an extra RO
+ * mapping for userspace. Each variable needs a unique offset within
+ * that page; specify that offset with the DECLARE_VVAR macro. (If
+ * you mess up, the linker will catch it.)
+ */
+
+/* Base address of vvars. This is not ABI. */
+#define VVAR_ADDRESS (-10*1024*1024 - 4096)
+
+#if defined(__VVAR_KERNEL_LDS)
+
+/* The kernel linker script defines its own magic to put vvars in the
+ * right place.
+ */
+#define DECLARE_VVAR(offset, type, name) \
+ EMIT_VVAR(name, offset)
+
+#else
+
+#define DECLARE_VVAR(offset, type, name) \
+ static type const * const vvaraddr_ ## name = \
+ (void *)(VVAR_ADDRESS + (offset));
+
+#define DEFINE_VVAR(type, name) \
+ type name \
+ __attribute__((section(".vvar_" #name), aligned(16)))
+
+#define VVAR(name) (*vvaraddr_ ## name)
+
+#endif
+
+/* DECLARE_VVAR(offset, type, name) */
+
+DECLARE_VVAR(0, volatile unsigned long, jiffies)
+DECLARE_VVAR(16, int, vgetcpu_mode)
+DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+
+#undef DECLARE_VVAR
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
new file mode 100644
index 00000000000..6bf5b8e478c
--- /dev/null
+++ b/arch/x86/include/asm/x2apic.h
@@ -0,0 +1,62 @@
+/*
+ * Common bits for X2APIC cluster/physical modes.
+ */
+
+#ifndef _ASM_X86_X2APIC_H
+#define _ASM_X86_X2APIC_H
+
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <linux/cpumask.h>
+
+/*
+ * Need to use more than cpu 0, because we need more vectors
+ * when MSI-X are used.
+ */
+static const struct cpumask *x2apic_target_cpus(void)
+{
+ return cpu_online_mask;
+}
+
+static int x2apic_apic_id_registered(void)
+{
+ return 1;
+}
+
+/*
+ * For now each logical cpu is in its own vector allocation domain.
+ */
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static void
+__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+{
+ unsigned long cfg = __prepare_ICR(0, vector, dest);
+ native_x2apic_icr_write(cfg, apicid);
+}
+
+static unsigned int x2apic_get_apic_id(unsigned long id)
+{
+ return id;
+}
+
+static unsigned long x2apic_set_apic_id(unsigned int id)
+{
+ return id;
+}
+
+static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+{
+ return initial_apicid >> index_msb;
+}
+
+static void x2apic_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+#endif /* _ASM_X86_X2APIC_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 643ebf2e2ad..d3d859035af 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -68,6 +68,17 @@ struct x86_init_oem {
};
/**
+ * struct x86_init_mapping - platform specific initial kernel pagetable setup
+ * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
+ *
+ * For more details on the purpose of this hook, look in
+ * init_memory_mapping and the commit that added it.
+ */
+struct x86_init_mapping {
+ void (*pagetable_reserve)(u64 start, u64 end);
+};
+
+/**
* struct x86_init_paging - platform specific paging functions
* @pagetable_setup_start: platform specific pre paging_init() call
* @pagetable_setup_done: platform specific post paging_init() call
@@ -123,6 +134,7 @@ struct x86_init_ops {
struct x86_init_mpparse mpparse;
struct x86_init_irqs irqs;
struct x86_init_oem oem;
+ struct x86_init_mapping mapping;
struct x86_init_paging paging;
struct x86_init_timers timers;
struct x86_init_iommu iommu;
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 8508bfe5229..417777de5a4 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -39,6 +39,8 @@
#include <linux/string.h>
#include <linux/types.h>
+#include <trace/events/xen.h>
+
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -447,11 +449,20 @@ HYPERVISOR_hvm_op(int op, void *arg)
return _hypercall2(unsigned long, hvm_op, op, arg);
}
+static inline int
+HYPERVISOR_tmem_op(
+ struct tmem_op *op)
+{
+ return _hypercall1(int, tmem_op, op);
+}
+
static inline void
MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
{
mcl->op = __HYPERVISOR_fpu_taskswitch;
mcl->args[0] = set;
+
+ trace_xen_mc_entry(mcl, 1);
}
static inline void
@@ -468,6 +479,8 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
mcl->args[2] = new_val.pte >> 32;
mcl->args[3] = flags;
}
+
+ trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4);
}
static inline void
@@ -478,6 +491,8 @@ MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
mcl->args[0] = cmd;
mcl->args[1] = (unsigned long)uop;
mcl->args[2] = count;
+
+ trace_xen_mc_entry(mcl, 3);
}
static inline void
@@ -497,6 +512,8 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v
mcl->args[3] = flags;
mcl->args[4] = domid;
}
+
+ trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 4 : 5);
}
static inline void
@@ -513,6 +530,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
mcl->args[2] = desc.a;
mcl->args[3] = desc.b;
}
+
+ trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
}
static inline void
@@ -521,6 +540,8 @@ MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
mcl->op = __HYPERVISOR_memory_op;
mcl->args[0] = cmd;
mcl->args[1] = (unsigned long)arg;
+
+ trace_xen_mc_entry(mcl, 2);
}
static inline void
@@ -532,6 +553,8 @@ MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
mcl->args[1] = count;
mcl->args[2] = (unsigned long)success_count;
mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
}
static inline void
@@ -543,6 +566,8 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
mcl->args[1] = count;
mcl->args[2] = (unsigned long)success_count;
mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
}
static inline void
@@ -551,6 +576,8 @@ MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
mcl->op = __HYPERVISOR_set_gdt;
mcl->args[0] = (unsigned long)frames;
mcl->args[1] = entries;
+
+ trace_xen_mc_entry(mcl, 2);
}
static inline void
@@ -560,6 +587,8 @@ MULTI_stack_switch(struct multicall_entry *mcl,
mcl->op = __HYPERVISOR_stack_switch;
mcl->args[0] = ss;
mcl->args[1] = esp;
+
+ trace_xen_mc_entry(mcl, 2);
}
#endif /* _ASM_X86_XEN_HYPERCALL_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c61934fbf22..64a619d47d3 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -47,8 +47,9 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern unsigned long set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e);
-extern int m2p_add_override(unsigned long mfn, struct page *page);
-extern int m2p_remove_override(struct page *page);
+extern int m2p_add_override(unsigned long mfn, struct page *page,
+ bool clear_pte);
+extern int m2p_remove_override(struct page *page, bool clear_pte);
extern struct page *m2p_find_override(unsigned long mfn);
extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index aa862098916..968d57dd54c 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -14,10 +14,27 @@ static inline int pci_xen_hvm_init(void)
}
#endif
#if defined(CONFIG_XEN_DOM0)
-void __init xen_setup_pirqs(void);
+int __init pci_xen_initial_domain(void);
+int xen_find_device_domain_owner(struct pci_dev *dev);
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
+int xen_unregister_device_domain_owner(struct pci_dev *dev);
#else
-static inline void __init xen_setup_pirqs(void)
+static inline int __init pci_xen_initial_domain(void)
{
+ return -1;
+}
+static inline int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
+}
+static inline int xen_register_device_domain_owner(struct pci_dev *dev,
+ uint16_t domain)
+{
+ return -1;
+}
+static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
}
#endif
diff --git a/arch/x86/include/asm/xen/trace_types.h b/arch/x86/include/asm/xen/trace_types.h
new file mode 100644
index 00000000000..21e1874c0a0
--- /dev/null
+++ b/arch/x86/include/asm/xen/trace_types.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_XEN_TRACE_TYPES_H
+#define _ASM_XEN_TRACE_TYPES_H
+
+enum xen_mc_flush_reason {
+ XEN_MC_FL_NONE, /* explicit flush */
+ XEN_MC_FL_BATCH, /* out of hypercall space */
+ XEN_MC_FL_ARGS, /* out of argument space */
+ XEN_MC_FL_CALLBACK, /* out of callback space */
+};
+
+enum xen_mc_extend_args {
+ XEN_MC_XE_OK,
+ XEN_MC_XE_BAD_OP,
+ XEN_MC_XE_NO_SPACE
+};
+typedef void (*xen_mc_callback_fn_t)(void *);
+
+#endif /* _ASM_XEN_TRACE_TYPES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7338ef2218b..04105574c8e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,7 +24,6 @@ endif
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
-CFLAGS_tsc.o := $(nostackp)
CFLAGS_paravirt.o := $(nostackp)
GCOV_PROFILE_vsyscall_64.o := n
GCOV_PROFILE_hpet.o := n
@@ -36,10 +35,11 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
-obj-$(CONFIG_X86_32) += probe_roms_32.o
+obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
@@ -117,9 +117,8 @@ obj-$(CONFIG_OF) += devicetree.o
ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_AUDIT) += audit_64.o
- obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
+ obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
- obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9a966c579af..4558f0d0822 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -970,7 +970,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
mp_irq.irqflag = (trigger << 2) | polarity;
mp_irq.srcbus = MP_ISA_BUS;
mp_irq.srcbusirq = bus_irq; /* IRQ */
- mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
+ mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
mp_irq.dstirq = pin; /* INTIN# */
mp_save_irq(&mp_irq);
@@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
if (ioapic < 0)
continue;
pin = mp_find_ioapic_pin(ioapic, gsi);
- dstapic = mp_ioapics[ioapic].apicid;
+ dstapic = mpc_ioapic_id(ioapic);
for (idx = 0; idx < mp_irq_entries; idx++) {
struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1082,7 +1082,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
mp_irq.srcbus = number;
mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
ioapic = mp_find_ioapic(gsi);
- mp_irq.dstapic = mp_ioapics[ioapic].apicid;
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
mp_save_irq(&mp_irq);
@@ -1113,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapics[ioapic].apicid,
+ "%d-%d\n", mpc_ioapic_id(ioapic),
ioapic_pin);
return gsi;
}
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index ead21b66311..b4fd836e405 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -28,6 +28,8 @@ pmode_cr3: .long 0 /* Saved %cr3 */
pmode_cr4: .long 0 /* Saved %cr4 */
pmode_efer: .quad 0 /* Saved EFER */
pmode_gdt: .quad 0
+pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
+pmode_behavior: .long 0 /* Wakeup behavior flags */
realmode_flags: .long 0
real_magic: .long 0
trampoline_segment: .word 0
@@ -91,6 +93,18 @@ wakeup_code:
/* Call the C code */
calll main
+ /* Restore MISC_ENABLE before entering protected mode, in case
+ BIOS decided to clear XD_DISABLE during S3. */
+ movl pmode_behavior, %eax
+ btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
+ jnc 1f
+
+ movl pmode_misc_en, %eax
+ movl pmode_misc_en + 4, %edx
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ wrmsr
+1:
+
/* Do any other stuff... */
#ifndef CONFIG_64BIT
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index e1828c07e79..97a29e1430e 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -21,6 +21,9 @@ struct wakeup_header {
u32 pmode_efer_low; /* Protected mode EFER */
u32 pmode_efer_high;
u64 pmode_gdt;
+ u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */
+ u32 pmode_misc_en_high;
+ u32 pmode_behavior; /* Wakeup routine behavior flags */
u32 realmode_flags;
u32 real_magic;
u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
@@ -39,4 +42,7 @@ extern struct wakeup_header wakeup_header;
#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
#define WAKEUP_END_SIGNATURE 0x65a22c82
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
+
#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ff93bc1b09c..103b6ab368d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -77,6 +77,12 @@ int acpi_suspend_lowlevel(void)
header->pmode_cr0 = read_cr0();
header->pmode_cr4 = read_cr4_safe();
+ header->pmode_behavior = 0;
+ if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+ &header->pmode_misc_en_low,
+ &header->pmode_misc_en_high))
+ header->pmode_behavior |=
+ (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
header->realmode_flags = acpi_realmode_flags;
header->real_magic = 0x12345678;
@@ -112,11 +118,6 @@ static int __init acpi_sleep_setup(char *str)
#ifdef CONFIG_HIBERNATION
if (strncmp(str, "s4_nohwsig", 10) == 0)
acpi_no_s4_hw_signature();
- if (strncmp(str, "s4_nonvs", 8) == 0) {
- pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
- "please use acpi_sleep=nonvs instead");
- acpi_nvs_nosave();
- }
#endif
if (strncmp(str, "nonvs", 5) == 0)
acpi_nvs_nosave();
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4a234677e21..c6382281624 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -14,7 +14,6 @@
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
-#include <asm/vsyscall.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
@@ -67,17 +66,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
#define DPRINTK(fmt, args...) if (debug_alternative) \
printk(KERN_DEBUG fmt, args)
+/*
+ * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
+ * that correspond to that nop. Getting from one nop to the next, we
+ * add to the array the offset that is equal to the sum of all sizes of
+ * nops preceding the one we are after.
+ *
+ * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
+ * nice symmetry of sizes of the previous nops.
+ */
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
-/* Use inline assembly to define this because the nops are defined
- as inline assembly strings in the include files and we cannot
- get them easily into strings. */
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: "
- GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
- GENERIC_NOP7 GENERIC_NOP8
- "\t.previous");
-extern const unsigned char intelnops[];
-static const unsigned char *const __initconst_or_module
-intel_nops[ASM_NOP_MAX+1] = {
+static const unsigned char intelnops[] =
+{
+ GENERIC_NOP1,
+ GENERIC_NOP2,
+ GENERIC_NOP3,
+ GENERIC_NOP4,
+ GENERIC_NOP5,
+ GENERIC_NOP6,
+ GENERIC_NOP7,
+ GENERIC_NOP8,
+ GENERIC_NOP5_ATOMIC
+};
+static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
+{
NULL,
intelnops,
intelnops + 1,
@@ -87,17 +99,25 @@ intel_nops[ASM_NOP_MAX+1] = {
intelnops + 1 + 2 + 3 + 4 + 5,
intelnops + 1 + 2 + 3 + 4 + 5 + 6,
intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
#ifdef K8_NOP1
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: "
- K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
- K8_NOP7 K8_NOP8
- "\t.previous");
-extern const unsigned char k8nops[];
-static const unsigned char *const __initconst_or_module
-k8_nops[ASM_NOP_MAX+1] = {
+static const unsigned char k8nops[] =
+{
+ K8_NOP1,
+ K8_NOP2,
+ K8_NOP3,
+ K8_NOP4,
+ K8_NOP5,
+ K8_NOP6,
+ K8_NOP7,
+ K8_NOP8,
+ K8_NOP5_ATOMIC
+};
+static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
+{
NULL,
k8nops,
k8nops + 1,
@@ -107,17 +127,25 @@ k8_nops[ASM_NOP_MAX+1] = {
k8nops + 1 + 2 + 3 + 4 + 5,
k8nops + 1 + 2 + 3 + 4 + 5 + 6,
k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: "
- K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
- K7_NOP7 K7_NOP8
- "\t.previous");
-extern const unsigned char k7nops[];
-static const unsigned char *const __initconst_or_module
-k7_nops[ASM_NOP_MAX+1] = {
+static const unsigned char k7nops[] =
+{
+ K7_NOP1,
+ K7_NOP2,
+ K7_NOP3,
+ K7_NOP4,
+ K7_NOP5,
+ K7_NOP6,
+ K7_NOP7,
+ K7_NOP8,
+ K7_NOP5_ATOMIC
+};
+static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
+{
NULL,
k7nops,
k7nops + 1,
@@ -127,17 +155,25 @@ k7_nops[ASM_NOP_MAX+1] = {
k7nops + 1 + 2 + 3 + 4 + 5,
k7nops + 1 + 2 + 3 + 4 + 5 + 6,
k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
#ifdef P6_NOP1
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: "
- P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
- P6_NOP7 P6_NOP8
- "\t.previous");
-extern const unsigned char p6nops[];
-static const unsigned char *const __initconst_or_module
-p6_nops[ASM_NOP_MAX+1] = {
+static const unsigned char __initconst_or_module p6nops[] =
+{
+ P6_NOP1,
+ P6_NOP2,
+ P6_NOP3,
+ P6_NOP4,
+ P6_NOP5,
+ P6_NOP6,
+ P6_NOP7,
+ P6_NOP8,
+ P6_NOP5_ATOMIC
+};
+static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
+{
NULL,
p6nops,
p6nops + 1,
@@ -147,47 +183,65 @@ p6_nops[ASM_NOP_MAX+1] = {
p6nops + 1 + 2 + 3 + 4 + 5,
p6nops + 1 + 2 + 3 + 4 + 5 + 6,
p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif
+/* Initialize these to a safe default */
#ifdef CONFIG_X86_64
+const unsigned char * const *ideal_nops = p6_nops;
+#else
+const unsigned char * const *ideal_nops = intel_nops;
+#endif
-extern char __vsyscall_0;
-static const unsigned char *const *__init_or_module find_nop_table(void)
+void __init arch_init_ideal_nops(void)
{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_has(X86_FEATURE_NOPL))
- return p6_nops;
- else
- return k8_nops;
-}
-
-#else /* CONFIG_X86_64 */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ /*
+ * Due to a decoder implementation quirk, some
+ * specific Intel CPUs actually perform better with
+ * the "k8_nops" than with the SDM-recommended NOPs.
+ */
+ if (boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model >= 0x0f &&
+ boot_cpu_data.x86_model != 0x1c &&
+ boot_cpu_data.x86_model != 0x26 &&
+ boot_cpu_data.x86_model != 0x27 &&
+ boot_cpu_data.x86_model < 0x30) {
+ ideal_nops = k8_nops;
+ } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
+ ideal_nops = p6_nops;
+ } else {
+#ifdef CONFIG_X86_64
+ ideal_nops = k8_nops;
+#else
+ ideal_nops = intel_nops;
+#endif
+ }
-static const unsigned char *const *__init_or_module find_nop_table(void)
-{
- if (boot_cpu_has(X86_FEATURE_K8))
- return k8_nops;
- else if (boot_cpu_has(X86_FEATURE_K7))
- return k7_nops;
- else if (boot_cpu_has(X86_FEATURE_NOPL))
- return p6_nops;
- else
- return intel_nops;
+ default:
+#ifdef CONFIG_X86_64
+ ideal_nops = k8_nops;
+#else
+ if (boot_cpu_has(X86_FEATURE_K8))
+ ideal_nops = k8_nops;
+ else if (boot_cpu_has(X86_FEATURE_K7))
+ ideal_nops = k7_nops;
+ else
+ ideal_nops = intel_nops;
+#endif
+ }
}
-#endif /* CONFIG_X86_64 */
-
/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
- const unsigned char *const *noptable = find_nop_table();
-
while (len > 0) {
unsigned int noplen = len;
if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;
- memcpy(insns, noptable[noplen], noplen);
+ memcpy(insns, ideal_nops[noplen], noplen);
insns += noplen;
len -= noplen;
}
@@ -207,29 +261,37 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
struct alt_instr *a;
+ u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+ /*
+ * The scan order should be from start to end. A later scanned
+ * alternative code can overwrite a previous scanned alternative code.
+ * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+ * patch code.
+ *
+ * So be careful if you want to change the scan order to any other
+ * order.
+ */
for (a = start; a < end; a++) {
- u8 *instr = a->instr;
+ instr = (u8 *)&a->instr_offset + a->instr_offset;
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->cpuid >= NCAPINTS*32);
if (!boot_cpu_has(a->cpuid))
continue;
-#ifdef CONFIG_X86_64
- /* vsyscall code is not mapped yet. resolve it manually. */
- if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
- instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
- DPRINTK("%s: vsyscall fixup: %p => %p\n",
- __func__, a->instr, instr);
- }
-#endif
- memcpy(insnbuf, a->replacement, a->replacementlen);
+
+ memcpy(insnbuf, replacement, a->replacementlen);
+
+ /* 0xe8 is a relative jump; fix the offset. */
if (*insnbuf == 0xe8 && a->replacementlen == 5)
- *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+
add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);
+
text_poke_early(instr, insnbuf, a->instrlen);
}
}
@@ -678,29 +740,3 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
wrote_text = 0;
__stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
}
-
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
-
-#ifdef CONFIG_X86_64
-unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 };
-#else
-unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 };
-#endif
-
-void __init arch_init_ideal_nop5(void)
-{
- /*
- * There is no good nop for all x86 archs. This selection
- * algorithm should be unified with the one in find_nop_table(),
- * but this should be good enough for now.
- *
- * For cases other than the ones below, use the safe (as in
- * always functional) defaults above.
- */
-#ifdef CONFIG_X86_64
- /* Don't use these on 32 bits due to broken virtualizers */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- memcpy(ideal_nop5, p6_nops[5], 5);
-#endif
-}
-#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 82ada01625b..8a439d364b9 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -30,7 +30,7 @@
#include <linux/syscore_ops.h>
#include <linux/io.h>
#include <linux/gfp.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/mtrr.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
@@ -81,6 +81,9 @@ static u32 gart_unmapped_entry;
#define AGPEXTERN
#endif
+/* GART can only remap to physical addresses < 1TB */
+#define GART_MAX_PHYS_ADDR (1ULL << 40)
+
/* backdoor interface to AGP driver */
AGPEXTERN int agp_memory_reserved;
AGPEXTERN __u32 *agp_gatt_table;
@@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
size_t size, int dir, unsigned long align_mask)
{
unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
- unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
+ unsigned long iommu_page;
int i;
+ if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
+ return bad_dma_addr;
+
+ iommu_page = alloc_iommu(dev, npages, align_mask);
if (iommu_page == -1) {
if (!nonforced_iommu(dev, phys_mem, size))
return phys_mem;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 57ca7778722..00000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2635 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/bitmap.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
-#include <linux/iommu-helper.h>
-#include <linux/iommu.h>
-#include <asm/proto.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-
-#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
-
-#define EXIT_LOOP_COUNT 10000000
-
-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
-
-/* A list of preallocated protection domains */
-static LIST_HEAD(iommu_pd_list);
-static DEFINE_SPINLOCK(iommu_pd_list_lock);
-
-/*
- * Domain for untranslated devices - only allocated
- * if iommu=pt passed on kernel cmd line.
- */
-static struct protection_domain *pt_domain;
-
-static struct iommu_ops amd_iommu_ops;
-
-/*
- * general struct to manage commands send to an IOMMU
- */
-struct iommu_cmd {
- u32 data[4];
-};
-
-static void reset_iommu_command_buffer(struct amd_iommu *iommu);
-static void update_domain(struct protection_domain *domain);
-
-/****************************************************************************
- *
- * Helper functions
- *
- ****************************************************************************/
-
-static inline u16 get_device_id(struct device *dev)
-{
- struct pci_dev *pdev = to_pci_dev(dev);
-
- return calc_devid(pdev->bus->number, pdev->devfn);
-}
-
-static struct iommu_dev_data *get_dev_data(struct device *dev)
-{
- return dev->archdata.iommu;
-}
-
-/*
- * In this function the list of preallocated protection domains is traversed to
- * find the domain for a specific device
- */
-static struct dma_ops_domain *find_protection_domain(u16 devid)
-{
- struct dma_ops_domain *entry, *ret = NULL;
- unsigned long flags;
- u16 alias = amd_iommu_alias_table[devid];
-
- if (list_empty(&iommu_pd_list))
- return NULL;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
-
- list_for_each_entry(entry, &iommu_pd_list, list) {
- if (entry->target_dev == devid ||
- entry->target_dev == alias) {
- ret = entry;
- break;
- }
- }
-
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- return ret;
-}
-
-/*
- * This function checks if the driver got a valid device from the caller to
- * avoid dereferencing invalid pointers.
- */
-static bool check_device(struct device *dev)
-{
- u16 devid;
-
- if (!dev || !dev->dma_mask)
- return false;
-
- /* No device or no PCI device */
- if (dev->bus != &pci_bus_type)
- return false;
-
- devid = get_device_id(dev);
-
- /* Out of our scope? */
- if (devid > amd_iommu_last_bdf)
- return false;
-
- if (amd_iommu_rlookup_table[devid] == NULL)
- return false;
-
- return true;
-}
-
-static int iommu_init_device(struct device *dev)
-{
- struct iommu_dev_data *dev_data;
- struct pci_dev *pdev;
- u16 devid, alias;
-
- if (dev->archdata.iommu)
- return 0;
-
- dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
- if (!dev_data)
- return -ENOMEM;
-
- dev_data->dev = dev;
-
- devid = get_device_id(dev);
- alias = amd_iommu_alias_table[devid];
- pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
- if (pdev)
- dev_data->alias = &pdev->dev;
-
- atomic_set(&dev_data->bind, 0);
-
- dev->archdata.iommu = dev_data;
-
-
- return 0;
-}
-
-static void iommu_uninit_device(struct device *dev)
-{
- kfree(dev->archdata.iommu);
-}
-
-void __init amd_iommu_uninit_devices(void)
-{
- struct pci_dev *pdev = NULL;
-
- for_each_pci_dev(pdev) {
-
- if (!check_device(&pdev->dev))
- continue;
-
- iommu_uninit_device(&pdev->dev);
- }
-}
-
-int __init amd_iommu_init_devices(void)
-{
- struct pci_dev *pdev = NULL;
- int ret = 0;
-
- for_each_pci_dev(pdev) {
-
- if (!check_device(&pdev->dev))
- continue;
-
- ret = iommu_init_device(&pdev->dev);
- if (ret)
- goto out_free;
- }
-
- return 0;
-
-out_free:
-
- amd_iommu_uninit_devices();
-
- return ret;
-}
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-/*
- * Initialization code for statistics collection
- */
-
-DECLARE_STATS_COUNTER(compl_wait);
-DECLARE_STATS_COUNTER(cnt_map_single);
-DECLARE_STATS_COUNTER(cnt_unmap_single);
-DECLARE_STATS_COUNTER(cnt_map_sg);
-DECLARE_STATS_COUNTER(cnt_unmap_sg);
-DECLARE_STATS_COUNTER(cnt_alloc_coherent);
-DECLARE_STATS_COUNTER(cnt_free_coherent);
-DECLARE_STATS_COUNTER(cross_page);
-DECLARE_STATS_COUNTER(domain_flush_single);
-DECLARE_STATS_COUNTER(domain_flush_all);
-DECLARE_STATS_COUNTER(alloced_io_mem);
-DECLARE_STATS_COUNTER(total_map_requests);
-
-static struct dentry *stats_dir;
-static struct dentry *de_fflush;
-
-static void amd_iommu_stats_add(struct __iommu_counter *cnt)
-{
- if (stats_dir == NULL)
- return;
-
- cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
- &cnt->value);
-}
-
-static void amd_iommu_stats_init(void)
-{
- stats_dir = debugfs_create_dir("amd-iommu", NULL);
- if (stats_dir == NULL)
- return;
-
- de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
- (u32 *)&amd_iommu_unmap_flush);
-
- amd_iommu_stats_add(&compl_wait);
- amd_iommu_stats_add(&cnt_map_single);
- amd_iommu_stats_add(&cnt_unmap_single);
- amd_iommu_stats_add(&cnt_map_sg);
- amd_iommu_stats_add(&cnt_unmap_sg);
- amd_iommu_stats_add(&cnt_alloc_coherent);
- amd_iommu_stats_add(&cnt_free_coherent);
- amd_iommu_stats_add(&cross_page);
- amd_iommu_stats_add(&domain_flush_single);
- amd_iommu_stats_add(&domain_flush_all);
- amd_iommu_stats_add(&alloced_io_mem);
- amd_iommu_stats_add(&total_map_requests);
-}
-
-#endif
-
-/****************************************************************************
- *
- * Interrupt handling functions
- *
- ****************************************************************************/
-
-static void dump_dte_entry(u16 devid)
-{
- int i;
-
- for (i = 0; i < 8; ++i)
- pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
- amd_iommu_dev_table[devid].data[i]);
-}
-
-static void dump_command(unsigned long phys_addr)
-{
- struct iommu_cmd *cmd = phys_to_virt(phys_addr);
- int i;
-
- for (i = 0; i < 4; ++i)
- pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
-}
-
-static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
-{
- u32 *event = __evt;
- int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
- int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
- int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
- int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
- u64 address = (u64)(((u64)event[3]) << 32) | event[2];
-
- printk(KERN_ERR "AMD-Vi: Event logged [");
-
- switch (type) {
- case EVENT_TYPE_ILL_DEV:
- printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- dump_dte_entry(devid);
- break;
- case EVENT_TYPE_IO_FAULT:
- printk("IO_PAGE_FAULT device=%02x:%02x.%x "
- "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- domid, address, flags);
- break;
- case EVENT_TYPE_DEV_TAB_ERR:
- printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- break;
- case EVENT_TYPE_PAGE_TAB_ERR:
- printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
- "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- domid, address, flags);
- break;
- case EVENT_TYPE_ILL_CMD:
- printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
- iommu->reset_in_progress = true;
- reset_iommu_command_buffer(iommu);
- dump_command(address);
- break;
- case EVENT_TYPE_CMD_HARD_ERR:
- printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
- "flags=0x%04x]\n", address, flags);
- break;
- case EVENT_TYPE_IOTLB_INV_TO:
- printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
- "address=0x%016llx]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address);
- break;
- case EVENT_TYPE_INV_DEV_REQ:
- printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- break;
- default:
- printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
- }
-}
-
-static void iommu_poll_events(struct amd_iommu *iommu)
-{
- u32 head, tail;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
-
- head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
- while (head != tail) {
- iommu_print_event(iommu, iommu->evt_buf + head);
- head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
- }
-
- writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-
- spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-irqreturn_t amd_iommu_int_handler(int irq, void *data)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_poll_events(iommu);
-
- return IRQ_HANDLED;
-}
-
-/****************************************************************************
- *
- * IOMMU command queuing functions
- *
- ****************************************************************************/
-
-/*
- * Writes the command to the IOMMUs command buffer and informs the
- * hardware about the new command. Must be called with iommu->lock held.
- */
-static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
- u32 tail, head;
- u8 *target;
-
- WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
- tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
- target = iommu->cmd_buf + tail;
- memcpy_toio(target, cmd, sizeof(*cmd));
- tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
- head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- if (tail == head)
- return -ENOMEM;
- writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- return 0;
-}
-
-/*
- * General queuing function for commands. Takes iommu->lock and calls
- * __iommu_queue_command().
- */
-static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&iommu->lock, flags);
- ret = __iommu_queue_command(iommu, cmd);
- if (!ret)
- iommu->need_sync = true;
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- return ret;
-}
-
-/*
- * This function waits until an IOMMU has completed a completion
- * wait command
- */
-static void __iommu_wait_for_completion(struct amd_iommu *iommu)
-{
- int ready = 0;
- unsigned status = 0;
- unsigned long i = 0;
-
- INC_STATS_COUNTER(compl_wait);
-
- while (!ready && (i < EXIT_LOOP_COUNT)) {
- ++i;
- /* wait for the bit to become one */
- status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
- ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
- }
-
- /* set bit back to zero */
- status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
- writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
- if (unlikely(i == EXIT_LOOP_COUNT))
- iommu->reset_in_progress = true;
-}
-
-/*
- * This function queues a completion wait command into the command
- * buffer of an IOMMU
- */
-static int __iommu_completion_wait(struct amd_iommu *iommu)
-{
- struct iommu_cmd cmd;
-
- memset(&cmd, 0, sizeof(cmd));
- cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
- CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
-
- return __iommu_queue_command(iommu, &cmd);
-}
-
-/*
- * This function is called whenever we need to ensure that the IOMMU has
- * completed execution of all commands we sent. It sends a
- * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
- * us about that by writing a value to a physical address we pass with
- * the command.
- */
-static int iommu_completion_wait(struct amd_iommu *iommu)
-{
- int ret = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
-
- if (!iommu->need_sync)
- goto out;
-
- ret = __iommu_completion_wait(iommu);
-
- iommu->need_sync = false;
-
- if (ret)
- goto out;
-
- __iommu_wait_for_completion(iommu);
-
-out:
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- if (iommu->reset_in_progress)
- reset_iommu_command_buffer(iommu);
-
- return 0;
-}
-
-static void iommu_flush_complete(struct protection_domain *domain)
-{
- int i;
-
- for (i = 0; i < amd_iommus_present; ++i) {
- if (!domain->dev_iommu[i])
- continue;
-
- /*
- * Devices of this domain are behind this IOMMU
- * We need to wait for completion of all commands.
- */
- iommu_completion_wait(amd_iommus[i]);
- }
-}
-
-/*
- * Command send function for invalidating a device table entry
- */
-static int iommu_flush_device(struct device *dev)
-{
- struct amd_iommu *iommu;
- struct iommu_cmd cmd;
- u16 devid;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
-
- /* Build command */
- memset(&cmd, 0, sizeof(cmd));
- CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
- cmd.data[0] = devid;
-
- return iommu_queue_command(iommu, &cmd);
-}
-
-static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
- u16 domid, int pde, int s)
-{
- memset(cmd, 0, sizeof(*cmd));
- address &= PAGE_MASK;
- CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
- cmd->data[1] |= domid;
- cmd->data[2] = lower_32_bits(address);
- cmd->data[3] = upper_32_bits(address);
- if (s) /* size bit - we flush more than one 4kb page */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
-}
-
-/*
- * Generic command send function for invalidaing TLB entries
- */
-static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
- u64 address, u16 domid, int pde, int s)
-{
- struct iommu_cmd cmd;
- int ret;
-
- __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
-
- ret = iommu_queue_command(iommu, &cmd);
-
- return ret;
-}
-
-/*
- * TLB invalidation function which is called from the mapping functions.
- * It invalidates a single PTE if the range to flush is within a single
- * page. Otherwise it flushes the whole TLB of the IOMMU.
- */
-static void __iommu_flush_pages(struct protection_domain *domain,
- u64 address, size_t size, int pde)
-{
- int s = 0, i;
- unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
-
- address &= PAGE_MASK;
-
- if (pages > 1) {
- /*
- * If we have to flush more than one page, flush all
- * TLB entries for this domain
- */
- address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
- s = 1;
- }
-
-
- for (i = 0; i < amd_iommus_present; ++i) {
- if (!domain->dev_iommu[i])
- continue;
-
- /*
- * Devices of this domain are behind this IOMMU
- * We need a TLB flush
- */
- iommu_queue_inv_iommu_pages(amd_iommus[i], address,
- domain->id, pde, s);
- }
-
- return;
-}
-
-static void iommu_flush_pages(struct protection_domain *domain,
- u64 address, size_t size)
-{
- __iommu_flush_pages(domain, address, size, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain */
-static void iommu_flush_tlb(struct protection_domain *domain)
-{
- __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain - including PDE */
-static void iommu_flush_tlb_pde(struct protection_domain *domain)
-{
- __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
-}
-
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-static void iommu_flush_domain_devices(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- list_for_each_entry(dev_data, &domain->dev_list, list)
- iommu_flush_device(dev_data->dev);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-static void iommu_flush_all_domain_devices(void)
-{
- struct protection_domain *domain;
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-
- list_for_each_entry(domain, &amd_iommu_pd_list, list) {
- iommu_flush_domain_devices(domain);
- iommu_flush_complete(domain);
- }
-
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-void amd_iommu_flush_all_devices(void)
-{
- iommu_flush_all_domain_devices();
-}
-
-/*
- * This function uses heavy locking and may disable irqs for some time. But
- * this is no issue because it is only called during resume.
- */
-void amd_iommu_flush_all_domains(void)
-{
- struct protection_domain *domain;
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-
- list_for_each_entry(domain, &amd_iommu_pd_list, list) {
- spin_lock(&domain->lock);
- iommu_flush_tlb_pde(domain);
- iommu_flush_complete(domain);
- spin_unlock(&domain->lock);
- }
-
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static void reset_iommu_command_buffer(struct amd_iommu *iommu)
-{
- pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
-
- if (iommu->reset_in_progress)
- panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
-
- amd_iommu_reset_cmd_buffer(iommu);
- amd_iommu_flush_all_devices();
- amd_iommu_flush_all_domains();
-
- iommu->reset_in_progress = false;
-}
-
-/****************************************************************************
- *
- * The functions below are used the create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct protection_domain *domain,
- gfp_t gfp)
-{
- u64 *pte;
-
- if (domain->mode == PAGE_MODE_6_LEVEL)
- /* address space already 64 bit large */
- return false;
-
- pte = (void *)get_zeroed_page(gfp);
- if (!pte)
- return false;
-
- *pte = PM_LEVEL_PDE(domain->mode,
- virt_to_phys(domain->pt_root));
- domain->pt_root = pte;
- domain->mode += 1;
- domain->updated = true;
-
- return true;
-}
-
-static u64 *alloc_pte(struct protection_domain *domain,
- unsigned long address,
- unsigned long page_size,
- u64 **pte_page,
- gfp_t gfp)
-{
- int level, end_lvl;
- u64 *pte, *page;
-
- BUG_ON(!is_power_of_2(page_size));
-
- while (address > PM_LEVEL_SIZE(domain->mode))
- increase_address_space(domain, gfp);
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- address = PAGE_SIZE_ALIGN(address, page_size);
- end_lvl = PAGE_SIZE_LEVEL(page_size);
-
- while (level > end_lvl) {
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
- }
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- pte = IOMMU_PTE_PAGE(*pte);
-
- if (pte_page && level == end_lvl)
- *pte_page = pte;
-
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
-{
- int level;
- u64 *pte;
-
- if (address > PM_LEVEL_SIZE(domain->mode))
- return NULL;
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-
- while (level > 0) {
-
- /* Not Present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Large PTE */
- if (PM_PTE_LEVEL(*pte) == 0x07) {
- unsigned long pte_mask, __pte;
-
- /*
- * If we have a series of large PTEs, make
- * sure to return a pointer to the first one.
- */
- pte_mask = PTE_PAGE_SIZE(*pte);
- pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
- __pte = ((unsigned long)pte) & pte_mask;
-
- return (u64 *)__pte;
- }
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- /* Walk to the next level */
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
- * Generic mapping functions. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_map_page(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long phys_addr,
- int prot,
- unsigned long page_size)
-{
- u64 __pte, *pte;
- int i, count;
-
- if (!(prot & IOMMU_PROT_MASK))
- return -EINVAL;
-
- bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(phys_addr);
- count = PAGE_SIZE_PTE_COUNT(page_size);
- pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
-
- for (i = 0; i < count; ++i)
- if (IOMMU_PTE_PRESENT(pte[i]))
- return -EBUSY;
-
- if (page_size > PAGE_SIZE) {
- __pte = PAGE_SIZE_PTE(phys_addr, page_size);
- __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
- } else
- __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
- if (prot & IOMMU_PROT_IR)
- __pte |= IOMMU_PTE_IR;
- if (prot & IOMMU_PROT_IW)
- __pte |= IOMMU_PTE_IW;
-
- for (i = 0; i < count; ++i)
- pte[i] = __pte;
-
- update_domain(dom);
-
- return 0;
-}
-
-static unsigned long iommu_unmap_page(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long page_size)
-{
- unsigned long long unmap_size, unmapped;
- u64 *pte;
-
- BUG_ON(!is_power_of_2(page_size));
-
- unmapped = 0;
-
- while (unmapped < page_size) {
-
- pte = fetch_pte(dom, bus_addr);
-
- if (!pte) {
- /*
- * No PTE for this address
- * move forward in 4kb steps
- */
- unmap_size = PAGE_SIZE;
- } else if (PM_PTE_LEVEL(*pte) == 0) {
- /* 4kb PTE found for this address */
- unmap_size = PAGE_SIZE;
- *pte = 0ULL;
- } else {
- int count, i;
-
- /* Large PTE found which maps this address */
- unmap_size = PTE_PAGE_SIZE(*pte);
- count = PAGE_SIZE_PTE_COUNT(unmap_size);
- for (i = 0; i < count; i++)
- pte[i] = 0ULL;
- }
-
- bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- BUG_ON(!is_power_of_2(unmapped));
-
- return unmapped;
-}
-
-/*
- * This function checks if a specific unity mapping entry is needed for
- * this specific IOMMU.
- */
-static int iommu_for_unity_map(struct amd_iommu *iommu,
- struct unity_map_entry *entry)
-{
- u16 bdf, i;
-
- for (i = entry->devid_start; i <= entry->devid_end; ++i) {
- bdf = amd_iommu_alias_table[i];
- if (amd_iommu_rlookup_table[bdf] == iommu)
- return 1;
- }
-
- return 0;
-}
-
-/*
- * This function actually applies the mapping to the page table of the
- * dma_ops domain.
- */
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
- struct unity_map_entry *e)
-{
- u64 addr;
- int ret;
-
- for (addr = e->address_start; addr < e->address_end;
- addr += PAGE_SIZE) {
- ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
- PAGE_SIZE);
- if (ret)
- return ret;
- /*
- * if unity mapping is in aperture range mark the page
- * as allocated in the aperture
- */
- if (addr < dma_dom->aperture_size)
- __set_bit(addr >> PAGE_SHIFT,
- dma_dom->aperture[0]->bitmap);
- }
-
- return 0;
-}
-
-/*
- * Init the unity mappings for a specific IOMMU in the system
- *
- * Basically iterates over all unity mapping entries and applies them to
- * the default domain DMA of that IOMMU if necessary.
- */
-static int iommu_init_unity_mappings(struct amd_iommu *iommu)
-{
- struct unity_map_entry *entry;
- int ret;
-
- list_for_each_entry(entry, &amd_iommu_unity_map, list) {
- if (!iommu_for_unity_map(iommu, entry))
- continue;
- ret = dma_ops_unity_map(iommu->default_dom, entry);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * Inits the unity mappings required for a specific device
- */
-static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
- u16 devid)
-{
- struct unity_map_entry *e;
- int ret;
-
- list_for_each_entry(e, &amd_iommu_unity_map, list) {
- if (!(devid >= e->devid_start && devid <= e->devid_end))
- continue;
- ret = dma_ops_unity_map(dma_dom, e);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the address allocator for the dma_ops
- * interface functions. They work like the allocators in the other IOMMU
- * drivers. Its basically a bitmap which marks the allocated pages in
- * the aperture. Maybe it could be enhanced in the future to a more
- * efficient allocator.
- *
- ****************************************************************************/
-
-/*
- * The address allocator core functions.
- *
- * called with domain->lock held
- */
-
-/*
- * Used to reserve address ranges in the aperture (e.g. for exclusion
- * ranges.
- */
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
- unsigned long start_page,
- unsigned int pages)
-{
- unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
-
- if (start_page + pages > last_page)
- pages = last_page - start_page;
-
- for (i = start_page; i < start_page + pages; ++i) {
- int index = i / APERTURE_RANGE_PAGES;
- int page = i % APERTURE_RANGE_PAGES;
- __set_bit(page, dom->aperture[index]->bitmap);
- }
-}
-
-/*
- * This function is used to add a new aperture range to an existing
- * aperture in case of dma_ops domain allocation or address allocation
- * failure.
- */
-static int alloc_new_range(struct dma_ops_domain *dma_dom,
- bool populate, gfp_t gfp)
-{
- int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
- struct amd_iommu *iommu;
- unsigned long i;
-
-#ifdef CONFIG_IOMMU_STRESS
- populate = false;
-#endif
-
- if (index >= APERTURE_MAX_RANGES)
- return -ENOMEM;
-
- dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
- if (!dma_dom->aperture[index])
- return -ENOMEM;
-
- dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
- if (!dma_dom->aperture[index]->bitmap)
- goto out_free;
-
- dma_dom->aperture[index]->offset = dma_dom->aperture_size;
-
- if (populate) {
- unsigned long address = dma_dom->aperture_size;
- int i, num_ptes = APERTURE_RANGE_PAGES / 512;
- u64 *pte, *pte_page;
-
- for (i = 0; i < num_ptes; ++i) {
- pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
- &pte_page, gfp);
- if (!pte)
- goto out_free;
-
- dma_dom->aperture[index]->pte_pages[i] = pte_page;
-
- address += APERTURE_RANGE_SIZE / 64;
- }
- }
-
- dma_dom->aperture_size += APERTURE_RANGE_SIZE;
-
- /* Initialize the exclusion range if necessary */
- for_each_iommu(iommu) {
- if (iommu->exclusion_start &&
- iommu->exclusion_start >= dma_dom->aperture[index]->offset
- && iommu->exclusion_start < dma_dom->aperture_size) {
- unsigned long startpage;
- int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length,
- PAGE_SIZE);
- startpage = iommu->exclusion_start >> PAGE_SHIFT;
- dma_ops_reserve_addresses(dma_dom, startpage, pages);
- }
- }
-
- /*
- * Check for areas already mapped as present in the new aperture
- * range and mark those pages as reserved in the allocator. Such
- * mappings may already exist as a result of requested unity
- * mappings for devices.
- */
- for (i = dma_dom->aperture[index]->offset;
- i < dma_dom->aperture_size;
- i += PAGE_SIZE) {
- u64 *pte = fetch_pte(&dma_dom->domain, i);
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- continue;
-
- dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
- }
-
- update_domain(&dma_dom->domain);
-
- return 0;
-
-out_free:
- update_domain(&dma_dom->domain);
-
- free_page((unsigned long)dma_dom->aperture[index]->bitmap);
-
- kfree(dma_dom->aperture[index]);
- dma_dom->aperture[index] = NULL;
-
- return -ENOMEM;
-}
-
-static unsigned long dma_ops_area_alloc(struct device *dev,
- struct dma_ops_domain *dom,
- unsigned int pages,
- unsigned long align_mask,
- u64 dma_mask,
- unsigned long start)
-{
- unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
- int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
- int i = start >> APERTURE_RANGE_SHIFT;
- unsigned long boundary_size;
- unsigned long address = -1;
- unsigned long limit;
-
- next_bit >>= PAGE_SHIFT;
-
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
-
- for (;i < max_index; ++i) {
- unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
-
- if (dom->aperture[i]->offset >= dma_mask)
- break;
-
- limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
- dma_mask >> PAGE_SHIFT);
-
- address = iommu_area_alloc(dom->aperture[i]->bitmap,
- limit, next_bit, pages, 0,
- boundary_size, align_mask);
- if (address != -1) {
- address = dom->aperture[i]->offset +
- (address << PAGE_SHIFT);
- dom->next_address = address + (pages << PAGE_SHIFT);
- break;
- }
-
- next_bit = 0;
- }
-
- return address;
-}
-
-static unsigned long dma_ops_alloc_addresses(struct device *dev,
- struct dma_ops_domain *dom,
- unsigned int pages,
- unsigned long align_mask,
- u64 dma_mask)
-{
- unsigned long address;
-
-#ifdef CONFIG_IOMMU_STRESS
- dom->next_address = 0;
- dom->need_flush = true;
-#endif
-
- address = dma_ops_area_alloc(dev, dom, pages, align_mask,
- dma_mask, dom->next_address);
-
- if (address == -1) {
- dom->next_address = 0;
- address = dma_ops_area_alloc(dev, dom, pages, align_mask,
- dma_mask, 0);
- dom->need_flush = true;
- }
-
- if (unlikely(address == -1))
- address = DMA_ERROR_CODE;
-
- WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
-
- return address;
-}
-
-/*
- * The address free function.
- *
- * called with domain->lock held
- */
-static void dma_ops_free_addresses(struct dma_ops_domain *dom,
- unsigned long address,
- unsigned int pages)
-{
- unsigned i = address >> APERTURE_RANGE_SHIFT;
- struct aperture_range *range = dom->aperture[i];
-
- BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
-
-#ifdef CONFIG_IOMMU_STRESS
- if (i < 4)
- return;
-#endif
-
- if (address >= dom->next_address)
- dom->need_flush = true;
-
- address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
-
- bitmap_clear(range->bitmap, address, pages);
-
-}
-
-/****************************************************************************
- *
- * The next functions belong to the domain allocation. A domain is
- * allocated for every IOMMU as the default domain. If device isolation
- * is enabled, every device get its own domain. The most important thing
- * about domains is the page table mapping the DMA address space they
- * contain.
- *
- ****************************************************************************/
-
-/*
- * This function adds a protection domain to the global protection domain list
- */
-static void add_domain_to_list(struct protection_domain *domain)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
- list_add(&domain->list, &amd_iommu_pd_list);
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-/*
- * This function removes a protection domain to the global
- * protection domain list
- */
-static void del_domain_from_list(struct protection_domain *domain)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
- list_del(&domain->list);
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static u16 domain_id_alloc(void)
-{
- unsigned long flags;
- int id;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
- BUG_ON(id == 0);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __set_bit(id, amd_iommu_pd_alloc_bitmap);
- else
- id = 0;
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return id;
-}
-
-static void domain_id_free(int id)
-{
- unsigned long flags;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __clear_bit(id, amd_iommu_pd_alloc_bitmap);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void free_pagetable(struct protection_domain *domain)
-{
- int i, j;
- u64 *p1, *p2, *p3;
-
- p1 = domain->pt_root;
-
- if (!p1)
- return;
-
- for (i = 0; i < 512; ++i) {
- if (!IOMMU_PTE_PRESENT(p1[i]))
- continue;
-
- p2 = IOMMU_PTE_PAGE(p1[i]);
- for (j = 0; j < 512; ++j) {
- if (!IOMMU_PTE_PRESENT(p2[j]))
- continue;
- p3 = IOMMU_PTE_PAGE(p2[j]);
- free_page((unsigned long)p3);
- }
-
- free_page((unsigned long)p2);
- }
-
- free_page((unsigned long)p1);
-
- domain->pt_root = NULL;
-}
-
-/*
- * Free a domain, only used if something went wrong in the
- * allocation path and we need to free an already allocated page table
- */
-static void dma_ops_domain_free(struct dma_ops_domain *dom)
-{
- int i;
-
- if (!dom)
- return;
-
- del_domain_from_list(&dom->domain);
-
- free_pagetable(&dom->domain);
-
- for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
- if (!dom->aperture[i])
- continue;
- free_page((unsigned long)dom->aperture[i]->bitmap);
- kfree(dom->aperture[i]);
- }
-
- kfree(dom);
-}
-
-/*
- * Allocates a new protection domain usable for the dma_ops functions.
- * It also initializes the page table and the address allocator data
- * structures required for the dma_ops interface
- */
-static struct dma_ops_domain *dma_ops_domain_alloc(void)
-{
- struct dma_ops_domain *dma_dom;
-
- dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
- if (!dma_dom)
- return NULL;
-
- spin_lock_init(&dma_dom->domain.lock);
-
- dma_dom->domain.id = domain_id_alloc();
- if (dma_dom->domain.id == 0)
- goto free_dma_dom;
- INIT_LIST_HEAD(&dma_dom->domain.dev_list);
- dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
- dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
- dma_dom->domain.flags = PD_DMA_OPS_MASK;
- dma_dom->domain.priv = dma_dom;
- if (!dma_dom->domain.pt_root)
- goto free_dma_dom;
-
- dma_dom->need_flush = false;
- dma_dom->target_dev = 0xffff;
-
- add_domain_to_list(&dma_dom->domain);
-
- if (alloc_new_range(dma_dom, true, GFP_KERNEL))
- goto free_dma_dom;
-
- /*
- * mark the first page as allocated so we never return 0 as
- * a valid dma-address. So we can use 0 as error value
- */
- dma_dom->aperture[0]->bitmap[0] = 1;
- dma_dom->next_address = 0;
-
-
- return dma_dom;
-
-free_dma_dom:
- dma_ops_domain_free(dma_dom);
-
- return NULL;
-}
-
-/*
- * little helper function to check whether a given protection domain is a
- * dma_ops domain
- */
-static bool dma_ops_domain(struct protection_domain *domain)
-{
- return domain->flags & PD_DMA_OPS_MASK;
-}
-
-static void set_dte_entry(u16 devid, struct protection_domain *domain)
-{
- u64 pte_root = virt_to_phys(domain->pt_root);
-
- pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
- pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
-
- amd_iommu_dev_table[devid].data[2] = domain->id;
- amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
-}
-
-static void clear_dte_entry(u16 devid)
-{
- /* remove entry from the device table seen by the hardware */
- amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
- amd_iommu_dev_table[devid].data[1] = 0;
- amd_iommu_dev_table[devid].data[2] = 0;
-
- amd_iommu_apply_erratum_63(devid);
-}
-
-static void do_attach(struct device *dev, struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- u16 devid;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
- dev_data = get_dev_data(dev);
-
- /* Update data structures */
- dev_data->domain = domain;
- list_add(&dev_data->list, &domain->dev_list);
- set_dte_entry(devid, domain);
-
- /* Do reference counting */
- domain->dev_iommu[iommu->index] += 1;
- domain->dev_cnt += 1;
-
- /* Flush the DTE entry */
- iommu_flush_device(dev);
-}
-
-static void do_detach(struct device *dev)
-{
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- u16 devid;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
- dev_data = get_dev_data(dev);
-
- /* decrease reference counters */
- dev_data->domain->dev_iommu[iommu->index] -= 1;
- dev_data->domain->dev_cnt -= 1;
-
- /* Update data structures */
- dev_data->domain = NULL;
- list_del(&dev_data->list);
- clear_dte_entry(devid);
-
- /* Flush the DTE entry */
- iommu_flush_device(dev);
-}
-
-/*
- * If a device is not yet associated with a domain, this function does
- * assigns it visible for the hardware
- */
-static int __attach_device(struct device *dev,
- struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data, *alias_data;
- int ret;
-
- dev_data = get_dev_data(dev);
- alias_data = get_dev_data(dev_data->alias);
-
- if (!alias_data)
- return -EINVAL;
-
- /* lock domain */
- spin_lock(&domain->lock);
-
- /* Some sanity checks */
- ret = -EBUSY;
- if (alias_data->domain != NULL &&
- alias_data->domain != domain)
- goto out_unlock;
-
- if (dev_data->domain != NULL &&
- dev_data->domain != domain)
- goto out_unlock;
-
- /* Do real assignment */
- if (dev_data->alias != dev) {
- alias_data = get_dev_data(dev_data->alias);
- if (alias_data->domain == NULL)
- do_attach(dev_data->alias, domain);
-
- atomic_inc(&alias_data->bind);
- }
-
- if (dev_data->domain == NULL)
- do_attach(dev, domain);
-
- atomic_inc(&dev_data->bind);
-
- ret = 0;
-
-out_unlock:
-
- /* ready */
- spin_unlock(&domain->lock);
-
- return ret;
-}
-
-/*
- * If a device is not yet associated with a domain, this function does
- * assigns it visible for the hardware
- */
-static int attach_device(struct device *dev,
- struct protection_domain *domain)
-{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- ret = __attach_device(dev, domain);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- /*
- * We might boot into a crash-kernel here. The crashed kernel
- * left the caches in the IOMMU dirty. So we have to flush
- * here to evict all dirty stuff.
- */
- iommu_flush_tlb_pde(domain);
-
- return ret;
-}
-
-/*
- * Removes a device from a protection domain (unlocked)
- */
-static void __detach_device(struct device *dev)
-{
- struct iommu_dev_data *dev_data = get_dev_data(dev);
- struct iommu_dev_data *alias_data;
- struct protection_domain *domain;
- unsigned long flags;
-
- BUG_ON(!dev_data->domain);
-
- domain = dev_data->domain;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- if (dev_data->alias != dev) {
- alias_data = get_dev_data(dev_data->alias);
- if (atomic_dec_and_test(&alias_data->bind))
- do_detach(dev_data->alias);
- }
-
- if (atomic_dec_and_test(&dev_data->bind))
- do_detach(dev);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
- /*
- * If we run in passthrough mode the device must be assigned to the
- * passthrough domain if it is detached from any other domain.
- * Make sure we can deassign from the pt_domain itself.
- */
- if (iommu_pass_through &&
- (dev_data->domain == NULL && domain != pt_domain))
- __attach_device(dev, pt_domain);
-}
-
-/*
- * Removes a device from a protection domain (with devtable_lock held)
- */
-static void detach_device(struct device *dev)
-{
- unsigned long flags;
-
- /* lock device table */
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __detach_device(dev);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-/*
- * Find out the protection domain structure for a given PCI device. This
- * will give us the pointer to the page table root for example.
- */
-static struct protection_domain *domain_for_device(struct device *dev)
-{
- struct protection_domain *dom;
- struct iommu_dev_data *dev_data, *alias_data;
- unsigned long flags;
- u16 devid, alias;
-
- devid = get_device_id(dev);
- alias = amd_iommu_alias_table[devid];
- dev_data = get_dev_data(dev);
- alias_data = get_dev_data(dev_data->alias);
- if (!alias_data)
- return NULL;
-
- read_lock_irqsave(&amd_iommu_devtable_lock, flags);
- dom = dev_data->domain;
- if (dom == NULL &&
- alias_data->domain != NULL) {
- __attach_device(dev, alias_data->domain);
- dom = alias_data->domain;
- }
-
- read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return dom;
-}
-
-static int device_change_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct device *dev = data;
- u16 devid;
- struct protection_domain *domain;
- struct dma_ops_domain *dma_domain;
- struct amd_iommu *iommu;
- unsigned long flags;
-
- if (!check_device(dev))
- return 0;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
-
- switch (action) {
- case BUS_NOTIFY_UNBOUND_DRIVER:
-
- domain = domain_for_device(dev);
-
- if (!domain)
- goto out;
- if (iommu_pass_through)
- break;
- detach_device(dev);
- break;
- case BUS_NOTIFY_ADD_DEVICE:
-
- iommu_init_device(dev);
-
- domain = domain_for_device(dev);
-
- /* allocate a protection domain if a device is added */
- dma_domain = find_protection_domain(devid);
- if (dma_domain)
- goto out;
- dma_domain = dma_ops_domain_alloc();
- if (!dma_domain)
- goto out;
- dma_domain->target_dev = devid;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
- list_add_tail(&dma_domain->list, &iommu_pd_list);
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- break;
- case BUS_NOTIFY_DEL_DEVICE:
-
- iommu_uninit_device(dev);
-
- default:
- goto out;
- }
-
- iommu_flush_device(dev);
- iommu_completion_wait(iommu);
-
-out:
- return 0;
-}
-
-static struct notifier_block device_nb = {
- .notifier_call = device_change_notifier,
-};
-
-void amd_iommu_init_notifier(void)
-{
- bus_register_notifier(&pci_bus_type, &device_nb);
-}
-
-/*****************************************************************************
- *
- * The next functions belong to the dma_ops mapping/unmapping code.
- *
- *****************************************************************************/
-
-/*
- * In the dma_ops path we only have the struct device. This function
- * finds the corresponding IOMMU, the protection domain and the
- * requestor id for a given device.
- * If the device is not yet associated with a domain this is also done
- * in this function.
- */
-static struct protection_domain *get_domain(struct device *dev)
-{
- struct protection_domain *domain;
- struct dma_ops_domain *dma_dom;
- u16 devid = get_device_id(dev);
-
- if (!check_device(dev))
- return ERR_PTR(-EINVAL);
-
- domain = domain_for_device(dev);
- if (domain != NULL && !dma_ops_domain(domain))
- return ERR_PTR(-EBUSY);
-
- if (domain != NULL)
- return domain;
-
- /* Device not bount yet - bind it */
- dma_dom = find_protection_domain(devid);
- if (!dma_dom)
- dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
- attach_device(dev, &dma_dom->domain);
- DUMP_printk("Using protection domain %d for device %s\n",
- dma_dom->domain.id, dev_name(dev));
-
- return &dma_dom->domain;
-}
-
-static void update_device_table(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
-
- list_for_each_entry(dev_data, &domain->dev_list, list) {
- u16 devid = get_device_id(dev_data->dev);
- set_dte_entry(devid, domain);
- }
-}
-
-static void update_domain(struct protection_domain *domain)
-{
- if (!domain->updated)
- return;
-
- update_device_table(domain);
- iommu_flush_domain_devices(domain);
- iommu_flush_tlb_pde(domain);
-
- domain->updated = false;
-}
-
-/*
- * This function fetches the PTE for a given address in the aperture
- */
-static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte, *pte_page;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return NULL;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte) {
- pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
- GFP_ATOMIC);
- aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
- } else
- pte += PM_LEVEL_INDEX(0, address);
-
- update_domain(&dom->domain);
-
- return pte;
-}
-
-/*
- * This is the generic map function. It maps one 4kb page at paddr to
- * the given address in the DMA address space for the domain.
- */
-static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
- unsigned long address,
- phys_addr_t paddr,
- int direction)
-{
- u64 *pte, __pte;
-
- WARN_ON(address > dom->aperture_size);
-
- paddr &= PAGE_MASK;
-
- pte = dma_ops_get_pte(dom, address);
- if (!pte)
- return DMA_ERROR_CODE;
-
- __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
- if (direction == DMA_TO_DEVICE)
- __pte |= IOMMU_PTE_IR;
- else if (direction == DMA_FROM_DEVICE)
- __pte |= IOMMU_PTE_IW;
- else if (direction == DMA_BIDIRECTIONAL)
- __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
-
- WARN_ON(*pte);
-
- *pte = __pte;
-
- return (dma_addr_t)address;
-}
-
-/*
- * The generic unmapping function for on page in the DMA address space.
- */
-static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte;
-
- if (address >= dom->aperture_size)
- return;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte)
- return;
-
- pte += PM_LEVEL_INDEX(0, address);
-
- WARN_ON(!*pte);
-
- *pte = 0ULL;
-}
-
-/*
- * This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is used by all
- * mapping functions provided with this IOMMU driver.
- * Must be called with the domain lock held.
- */
-static dma_addr_t __map_single(struct device *dev,
- struct dma_ops_domain *dma_dom,
- phys_addr_t paddr,
- size_t size,
- int dir,
- bool align,
- u64 dma_mask)
-{
- dma_addr_t offset = paddr & ~PAGE_MASK;
- dma_addr_t address, start, ret;
- unsigned int pages;
- unsigned long align_mask = 0;
- int i;
-
- pages = iommu_num_pages(paddr, size, PAGE_SIZE);
- paddr &= PAGE_MASK;
-
- INC_STATS_COUNTER(total_map_requests);
-
- if (pages > 1)
- INC_STATS_COUNTER(cross_page);
-
- if (align)
- align_mask = (1UL << get_order(size)) - 1;
-
-retry:
- address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
- dma_mask);
- if (unlikely(address == DMA_ERROR_CODE)) {
- /*
- * setting next_address here will let the address
- * allocator only scan the new allocated range in the
- * first run. This is a small optimization.
- */
- dma_dom->next_address = dma_dom->aperture_size;
-
- if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
- goto out;
-
- /*
- * aperture was successfully enlarged by 128 MB, try
- * allocation again
- */
- goto retry;
- }
-
- start = address;
- for (i = 0; i < pages; ++i) {
- ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
- if (ret == DMA_ERROR_CODE)
- goto out_unmap;
-
- paddr += PAGE_SIZE;
- start += PAGE_SIZE;
- }
- address += offset;
-
- ADD_STATS_COUNTER(alloced_io_mem, size);
-
- if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
- iommu_flush_tlb(&dma_dom->domain);
- dma_dom->need_flush = false;
- } else if (unlikely(amd_iommu_np_cache))
- iommu_flush_pages(&dma_dom->domain, address, size);
-
-out:
- return address;
-
-out_unmap:
-
- for (--i; i >= 0; --i) {
- start -= PAGE_SIZE;
- dma_ops_domain_unmap(dma_dom, start);
- }
-
- dma_ops_free_addresses(dma_dom, address, pages);
-
- return DMA_ERROR_CODE;
-}
-
-/*
- * Does the reverse of the __map_single function. Must be called with
- * the domain lock held too
- */
-static void __unmap_single(struct dma_ops_domain *dma_dom,
- dma_addr_t dma_addr,
- size_t size,
- int dir)
-{
- dma_addr_t flush_addr;
- dma_addr_t i, start;
- unsigned int pages;
-
- if ((dma_addr == DMA_ERROR_CODE) ||
- (dma_addr + size > dma_dom->aperture_size))
- return;
-
- flush_addr = dma_addr;
- pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- dma_addr &= PAGE_MASK;
- start = dma_addr;
-
- for (i = 0; i < pages; ++i) {
- dma_ops_domain_unmap(dma_dom, start);
- start += PAGE_SIZE;
- }
-
- SUB_STATS_COUNTER(alloced_io_mem, size);
-
- dma_ops_free_addresses(dma_dom, dma_addr, pages);
-
- if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- iommu_flush_pages(&dma_dom->domain, flush_addr, size);
- dma_dom->need_flush = false;
- }
-}
-
-/*
- * The exported map_single function for dma_ops.
- */
-static dma_addr_t map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- dma_addr_t addr;
- u64 dma_mask;
- phys_addr_t paddr = page_to_phys(page) + offset;
-
- INC_STATS_COUNTER(cnt_map_single);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL)
- return (dma_addr_t)paddr;
- else if (IS_ERR(domain))
- return DMA_ERROR_CODE;
-
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- addr = __map_single(dev, domain->priv, paddr, size, dir, false,
- dma_mask);
- if (addr == DMA_ERROR_CODE)
- goto out;
-
- iommu_flush_complete(domain);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return addr;
-}
-
-/*
- * The exported unmap_single function for dma_ops.
- */
-static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
- enum dma_data_direction dir, struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
-
- INC_STATS_COUNTER(cnt_unmap_single);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(domain->priv, dma_addr, size, dir);
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * This is a special map_sg function which is used if we should map a
- * device which is not handled by an AMD IOMMU in the system.
- */
-static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
- int nelems, int dir)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sglist, s, nelems, i) {
- s->dma_address = (dma_addr_t)sg_phys(s);
- s->dma_length = s->length;
- }
-
- return nelems;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static int map_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- int i;
- struct scatterlist *s;
- phys_addr_t paddr;
- int mapped_elems = 0;
- u64 dma_mask;
-
- INC_STATS_COUNTER(cnt_map_sg);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL)
- return map_sg_no_iommu(dev, sglist, nelems, dir);
- else if (IS_ERR(domain))
- return 0;
-
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- paddr = sg_phys(s);
-
- s->dma_address = __map_single(dev, domain->priv,
- paddr, s->length, dir, false,
- dma_mask);
-
- if (s->dma_address) {
- s->dma_length = s->length;
- mapped_elems++;
- } else
- goto unmap;
- }
-
- iommu_flush_complete(domain);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return mapped_elems;
-unmap:
- for_each_sg(sglist, s, mapped_elems, i) {
- if (s->dma_address)
- __unmap_single(domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- mapped_elems = 0;
-
- goto out;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static void unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- struct scatterlist *s;
- int i;
-
- INC_STATS_COUNTER(cnt_unmap_sg);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- __unmap_single(domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * The exported alloc_coherent function for dma_ops.
- */
-static void *alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag)
-{
- unsigned long flags;
- void *virt_addr;
- struct protection_domain *domain;
- phys_addr_t paddr;
- u64 dma_mask = dev->coherent_dma_mask;
-
- INC_STATS_COUNTER(cnt_alloc_coherent);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL) {
- virt_addr = (void *)__get_free_pages(flag, get_order(size));
- *dma_addr = __pa(virt_addr);
- return virt_addr;
- } else if (IS_ERR(domain))
- return NULL;
-
- dma_mask = dev->coherent_dma_mask;
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- flag |= __GFP_ZERO;
-
- virt_addr = (void *)__get_free_pages(flag, get_order(size));
- if (!virt_addr)
- return NULL;
-
- paddr = virt_to_phys(virt_addr);
-
- if (!dma_mask)
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- *dma_addr = __map_single(dev, domain->priv, paddr,
- size, DMA_BIDIRECTIONAL, true, dma_mask);
-
- if (*dma_addr == DMA_ERROR_CODE) {
- spin_unlock_irqrestore(&domain->lock, flags);
- goto out_free;
- }
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return virt_addr;
-
-out_free:
-
- free_pages((unsigned long)virt_addr, get_order(size));
-
- return NULL;
-}
-
-/*
- * The exported free_coherent function for dma_ops.
- */
-static void free_coherent(struct device *dev, size_t size,
- void *virt_addr, dma_addr_t dma_addr)
-{
- unsigned long flags;
- struct protection_domain *domain;
-
- INC_STATS_COUNTER(cnt_free_coherent);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- goto free_mem;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
-free_mem:
- free_pages((unsigned long)virt_addr, get_order(size));
-}
-
-/*
- * This function is called by the DMA layer to find out if we can handle a
- * particular device. It is part of the dma_ops.
- */
-static int amd_iommu_dma_supported(struct device *dev, u64 mask)
-{
- return check_device(dev);
-}
-
-/*
- * The function for pre-allocating protection domains.
- *
- * If the driver core informs the DMA layer if a driver grabs a device
- * we don't need to preallocate the protection domains anymore.
- * For now we have to.
- */
-static void prealloc_protection_domains(void)
-{
- struct pci_dev *dev = NULL;
- struct dma_ops_domain *dma_dom;
- u16 devid;
-
- for_each_pci_dev(dev) {
-
- /* Do we handle this device? */
- if (!check_device(&dev->dev))
- continue;
-
- /* Is there already any domain for it? */
- if (domain_for_device(&dev->dev))
- continue;
-
- devid = get_device_id(&dev->dev);
-
- dma_dom = dma_ops_domain_alloc();
- if (!dma_dom)
- continue;
- init_unity_mappings_for_device(dma_dom, devid);
- dma_dom->target_dev = devid;
-
- attach_device(&dev->dev, &dma_dom->domain);
-
- list_add_tail(&dma_dom->list, &iommu_pd_list);
- }
-}
-
-static struct dma_map_ops amd_iommu_dma_ops = {
- .alloc_coherent = alloc_coherent,
- .free_coherent = free_coherent,
- .map_page = map_page,
- .unmap_page = unmap_page,
- .map_sg = map_sg,
- .unmap_sg = unmap_sg,
- .dma_supported = amd_iommu_dma_supported,
-};
-
-/*
- * The function which clues the AMD IOMMU driver into dma_ops.
- */
-
-void __init amd_iommu_init_api(void)
-{
- register_iommu(&amd_iommu_ops);
-}
-
-int __init amd_iommu_init_dma_ops(void)
-{
- struct amd_iommu *iommu;
- int ret;
-
- /*
- * first allocate a default protection domain for every IOMMU we
- * found in the system. Devices not assigned to any other
- * protection domain will be assigned to the default one.
- */
- for_each_iommu(iommu) {
- iommu->default_dom = dma_ops_domain_alloc();
- if (iommu->default_dom == NULL)
- return -ENOMEM;
- iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
- ret = iommu_init_unity_mappings(iommu);
- if (ret)
- goto free_domains;
- }
-
- /*
- * Pre-allocate the protection domains for each device.
- */
- prealloc_protection_domains();
-
- iommu_detected = 1;
- swiotlb = 0;
-
- /* Make the driver finally visible to the drivers */
- dma_ops = &amd_iommu_dma_ops;
-
- amd_iommu_stats_init();
-
- return 0;
-
-free_domains:
-
- for_each_iommu(iommu) {
- if (iommu->default_dom)
- dma_ops_domain_free(iommu->default_dom);
- }
-
- return ret;
-}
-
-/*****************************************************************************
- *
- * The following functions belong to the exported interface of AMD IOMMU
- *
- * This interface allows access to lower level functions of the IOMMU
- * like protection domain handling and assignement of devices to domains
- * which is not possible with the dma_ops interface.
- *
- *****************************************************************************/
-
-static void cleanup_domain(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data, *next;
- unsigned long flags;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-
- list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
- struct device *dev = dev_data->dev;
-
- __detach_device(dev);
- atomic_set(&dev_data->bind, 0);
- }
-
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void protection_domain_free(struct protection_domain *domain)
-{
- if (!domain)
- return;
-
- del_domain_from_list(domain);
-
- if (domain->id)
- domain_id_free(domain->id);
-
- kfree(domain);
-}
-
-static struct protection_domain *protection_domain_alloc(void)
-{
- struct protection_domain *domain;
-
- domain = kzalloc(sizeof(*domain), GFP_KERNEL);
- if (!domain)
- return NULL;
-
- spin_lock_init(&domain->lock);
- mutex_init(&domain->api_lock);
- domain->id = domain_id_alloc();
- if (!domain->id)
- goto out_err;
- INIT_LIST_HEAD(&domain->dev_list);
-
- add_domain_to_list(domain);
-
- return domain;
-
-out_err:
- kfree(domain);
-
- return NULL;
-}
-
-static int amd_iommu_domain_init(struct iommu_domain *dom)
-{
- struct protection_domain *domain;
-
- domain = protection_domain_alloc();
- if (!domain)
- goto out_free;
-
- domain->mode = PAGE_MODE_3_LEVEL;
- domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
- if (!domain->pt_root)
- goto out_free;
-
- dom->priv = domain;
-
- return 0;
-
-out_free:
- protection_domain_free(domain);
-
- return -ENOMEM;
-}
-
-static void amd_iommu_domain_destroy(struct iommu_domain *dom)
-{
- struct protection_domain *domain = dom->priv;
-
- if (!domain)
- return;
-
- if (domain->dev_cnt > 0)
- cleanup_domain(domain);
-
- BUG_ON(domain->dev_cnt != 0);
-
- free_pagetable(domain);
-
- protection_domain_free(domain);
-
- dom->priv = NULL;
-}
-
-static void amd_iommu_detach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct iommu_dev_data *dev_data = dev->archdata.iommu;
- struct amd_iommu *iommu;
- u16 devid;
-
- if (!check_device(dev))
- return;
-
- devid = get_device_id(dev);
-
- if (dev_data->domain != NULL)
- detach_device(dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return;
-
- iommu_flush_device(dev);
- iommu_completion_wait(iommu);
-}
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct protection_domain *domain = dom->priv;
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- int ret;
- u16 devid;
-
- if (!check_device(dev))
- return -EINVAL;
-
- dev_data = dev->archdata.iommu;
-
- devid = get_device_id(dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return -EINVAL;
-
- if (dev_data->domain)
- detach_device(dev);
-
- ret = attach_device(dev, domain);
-
- iommu_completion_wait(iommu);
-
- return ret;
-}
-
-static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
- phys_addr_t paddr, int gfp_order, int iommu_prot)
-{
- unsigned long page_size = 0x1000UL << gfp_order;
- struct protection_domain *domain = dom->priv;
- int prot = 0;
- int ret;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- mutex_lock(&domain->api_lock);
- ret = iommu_map_page(domain, iova, paddr, prot, page_size);
- mutex_unlock(&domain->api_lock);
-
- return ret;
-}
-
-static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
- int gfp_order)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long page_size, unmap_size;
-
- page_size = 0x1000UL << gfp_order;
-
- mutex_lock(&domain->api_lock);
- unmap_size = iommu_unmap_page(domain, iova, page_size);
- mutex_unlock(&domain->api_lock);
-
- iommu_flush_tlb_pde(domain);
-
- return get_order(unmap_size);
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
- unsigned long iova)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long offset_mask;
- phys_addr_t paddr;
- u64 *pte, __pte;
-
- pte = fetch_pte(domain, iova);
-
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- if (PM_PTE_LEVEL(*pte) == 0)
- offset_mask = PAGE_SIZE - 1;
- else
- offset_mask = PTE_PAGE_SIZE(*pte) - 1;
-
- __pte = *pte & PM_ADDR_MASK;
- paddr = (__pte & ~offset_mask) | (iova & offset_mask);
-
- return paddr;
-}
-
-static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
- unsigned long cap)
-{
- switch (cap) {
- case IOMMU_CAP_CACHE_COHERENCY:
- return 1;
- }
-
- return 0;
-}
-
-static struct iommu_ops amd_iommu_ops = {
- .domain_init = amd_iommu_domain_init,
- .domain_destroy = amd_iommu_domain_destroy,
- .attach_dev = amd_iommu_attach_device,
- .detach_dev = amd_iommu_detach_device,
- .map = amd_iommu_map,
- .unmap = amd_iommu_unmap,
- .iova_to_phys = amd_iommu_iova_to_phys,
- .domain_has_cap = amd_iommu_domain_has_cap,
-};
-
-/*****************************************************************************
- *
- * The next functions do a basic initialization of IOMMU for pass through
- * mode
- *
- * In passthrough mode the IOMMU is initialized and enabled but not used for
- * DMA-API translation.
- *
- *****************************************************************************/
-
-int __init amd_iommu_init_passthrough(void)
-{
- struct amd_iommu *iommu;
- struct pci_dev *dev = NULL;
- u16 devid;
-
- /* allocate passthrough domain */
- pt_domain = protection_domain_alloc();
- if (!pt_domain)
- return -ENOMEM;
-
- pt_domain->mode |= PAGE_MODE_NONE;
-
- for_each_pci_dev(dev) {
- if (!check_device(&dev->dev))
- continue;
-
- devid = get_device_id(&dev->dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- continue;
-
- attach_device(&dev->dev, pt_domain);
- }
-
- pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
-
- return 0;
-}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index 246d727b65b..00000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1540 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/syscore_ops.h>
-#include <linux/interrupt.h>
-#include <linux/msi.h>
-#include <asm/pci-direct.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
-/*
- * definitions for the ACPI scanning code
- */
-#define IVRS_HEADER_LENGTH 48
-
-#define ACPI_IVHD_TYPE 0x10
-#define ACPI_IVMD_TYPE_ALL 0x20
-#define ACPI_IVMD_TYPE 0x21
-#define ACPI_IVMD_TYPE_RANGE 0x22
-
-#define IVHD_DEV_ALL 0x01
-#define IVHD_DEV_SELECT 0x02
-#define IVHD_DEV_SELECT_RANGE_START 0x03
-#define IVHD_DEV_RANGE_END 0x04
-#define IVHD_DEV_ALIAS 0x42
-#define IVHD_DEV_ALIAS_RANGE 0x43
-#define IVHD_DEV_EXT_SELECT 0x46
-#define IVHD_DEV_EXT_SELECT_RANGE 0x47
-
-#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
-#define IVHD_FLAG_PASSPW_EN_MASK 0x02
-#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
-#define IVHD_FLAG_ISOC_EN_MASK 0x08
-
-#define IVMD_FLAG_EXCL_RANGE 0x08
-#define IVMD_FLAG_UNITY_MAP 0x01
-
-#define ACPI_DEVFLAG_INITPASS 0x01
-#define ACPI_DEVFLAG_EXTINT 0x02
-#define ACPI_DEVFLAG_NMI 0x04
-#define ACPI_DEVFLAG_SYSMGT1 0x10
-#define ACPI_DEVFLAG_SYSMGT2 0x20
-#define ACPI_DEVFLAG_LINT0 0x40
-#define ACPI_DEVFLAG_LINT1 0x80
-#define ACPI_DEVFLAG_ATSDIS 0x10000000
-
-/*
- * ACPI table definitions
- *
- * These data structures are laid over the table to parse the important values
- * out of it.
- */
-
-/*
- * structure describing one IOMMU in the ACPI table. Typically followed by one
- * or more ivhd_entrys.
- */
-struct ivhd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 cap_ptr;
- u64 mmio_phys;
- u16 pci_seg;
- u16 info;
- u32 reserved;
-} __attribute__((packed));
-
-/*
- * A device entry describing which devices a specific IOMMU translates and
- * which requestor ids they use.
- */
-struct ivhd_entry {
- u8 type;
- u16 devid;
- u8 flags;
- u32 ext;
-} __attribute__((packed));
-
-/*
- * An AMD IOMMU memory definition structure. It defines things like exclusion
- * ranges for devices and regions that should be unity mapped.
- */
-struct ivmd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 aux;
- u64 resv;
- u64 range_start;
- u64 range_length;
-} __attribute__((packed));
-
-bool amd_iommu_dump;
-
-static int __initdata amd_iommu_detected;
-static bool __initdata amd_iommu_disabled;
-
-u16 amd_iommu_last_bdf; /* largest PCI device id we have
- to handle */
-LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
- we find in ACPI */
-bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
-
-LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
- system */
-
-/* Array to assign indices to IOMMUs*/
-struct amd_iommu *amd_iommus[MAX_IOMMUS];
-int amd_iommus_present;
-
-/* IOMMUs have a non-present cache? */
-bool amd_iommu_np_cache __read_mostly;
-
-/*
- * The ACPI table parsing functions set this variable on an error
- */
-static int __initdata amd_iommu_init_err;
-
-/*
- * List of protection domains - used during resume
- */
-LIST_HEAD(amd_iommu_pd_list);
-spinlock_t amd_iommu_pd_lock;
-
-/*
- * Pointer to the device table which is shared by all AMD IOMMUs
- * it is indexed by the PCI device id or the HT unit id and contains
- * information about the domain the device belongs to as well as the
- * page table root pointer.
- */
-struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * The alias table is a driver specific data structure which contains the
- * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
- * More than one device can share the same requestor id.
- */
-u16 *amd_iommu_alias_table;
-
-/*
- * The rlookup table is used to find the IOMMU which is responsible
- * for a specific device. It is also indexed by the PCI device id.
- */
-struct amd_iommu **amd_iommu_rlookup_table;
-
-/*
- * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
- * to know which ones are already in use.
- */
-unsigned long *amd_iommu_pd_alloc_bitmap;
-
-static u32 dev_table_size; /* size of the device table */
-static u32 alias_table_size; /* size of the alias table */
-static u32 rlookup_table_size; /* size if the rlookup table */
-
-static inline void update_last_devid(u16 devid)
-{
- if (devid > amd_iommu_last_bdf)
- amd_iommu_last_bdf = devid;
-}
-
-static inline unsigned long tbl_size(int entry_size)
-{
- unsigned shift = PAGE_SHIFT +
- get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
-
- return 1UL << shift;
-}
-
-/* Access to l1 and l2 indexed register spaces */
-
-static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
-{
- u32 val;
-
- pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
- pci_read_config_dword(iommu->dev, 0xfc, &val);
- return val;
-}
-
-static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
-{
- pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
- pci_write_config_dword(iommu->dev, 0xfc, val);
- pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
-}
-
-static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
-{
- u32 val;
-
- pci_write_config_dword(iommu->dev, 0xf0, address);
- pci_read_config_dword(iommu->dev, 0xf4, &val);
- return val;
-}
-
-static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
-{
- pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
- pci_write_config_dword(iommu->dev, 0xf4, val);
-}
-
-/****************************************************************************
- *
- * AMD IOMMU MMIO register space handling functions
- *
- * These functions are used to program the IOMMU device registers in
- * MMIO space required for that driver.
- *
- ****************************************************************************/
-
-/*
- * This function set the exclusion range in the IOMMU. DMA accesses to the
- * exclusion range are passed through untranslated
- */
-static void iommu_set_exclusion_range(struct amd_iommu *iommu)
-{
- u64 start = iommu->exclusion_start & PAGE_MASK;
- u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
- u64 entry;
-
- if (!iommu->exclusion_start)
- return;
-
- entry = start | MMIO_EXCL_ENABLE_MASK;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
- &entry, sizeof(entry));
-
- entry = limit;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Programs the physical address of the device table into the IOMMU hardware */
-static void __init iommu_set_device_table(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->mmio_base == NULL);
-
- entry = virt_to_phys(amd_iommu_dev_table);
- entry |= (dev_table_size >> 12) - 1;
- memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Generic functions to enable/disable certain features of the IOMMU. */
-static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl |= (1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl &= ~(1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-/* Function to enable the hardware */
-static void iommu_enable(struct amd_iommu *iommu)
-{
- printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
- dev_name(&iommu->dev->dev), iommu->cap_ptr);
-
- iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
-}
-
-static void iommu_disable(struct amd_iommu *iommu)
-{
- /* Disable command buffer */
- iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
- /* Disable event logging and event interrupts */
- iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
- iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
-
- /* Disable IOMMU hardware itself */
- iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
-}
-
-/*
- * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
- * the system has one.
- */
-static u8 * __init iommu_map_mmio_space(u64 address)
-{
- u8 *ret;
-
- if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
- pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
- address);
- pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
- return NULL;
- }
-
- ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
- if (ret != NULL)
- return ret;
-
- release_mem_region(address, MMIO_REGION_LENGTH);
-
- return NULL;
-}
-
-static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
-{
- if (iommu->mmio_base)
- iounmap(iommu->mmio_base);
- release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
-}
-
-/****************************************************************************
- *
- * The functions below belong to the first pass of AMD IOMMU ACPI table
- * parsing. In this pass we try to find out the highest device id this
- * code has to handle. Upon this information the size of the shared data
- * structures is determined later.
- *
- ****************************************************************************/
-
-/*
- * This function calculates the length of a given IVHD entry
- */
-static inline int ivhd_entry_length(u8 *ivhd)
-{
- return 0x04 << (*ivhd >> 6);
-}
-
-/*
- * This function reads the last device id the IOMMU has to handle from the PCI
- * capability header for this IOMMU
- */
-static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
-{
- u32 cap;
-
- cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
- update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
-
- return 0;
-}
-
-/*
- * After reading the highest device id from the IOMMU PCI capability header
- * this function looks if there is a higher device id defined in the ACPI table
- */
-static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
-{
- u8 *p = (void *)h, *end = (void *)h;
- struct ivhd_entry *dev;
-
- p += sizeof(*h);
- end += h->length;
-
- find_last_devid_on_pci(PCI_BUS(h->devid),
- PCI_SLOT(h->devid),
- PCI_FUNC(h->devid),
- h->cap_ptr);
-
- while (p < end) {
- dev = (struct ivhd_entry *)p;
- switch (dev->type) {
- case IVHD_DEV_SELECT:
- case IVHD_DEV_RANGE_END:
- case IVHD_DEV_ALIAS:
- case IVHD_DEV_EXT_SELECT:
- /* all the above subfield types refer to device ids */
- update_last_devid(dev->devid);
- break;
- default:
- break;
- }
- p += ivhd_entry_length(p);
- }
-
- WARN_ON(p != end);
-
- return 0;
-}
-
-/*
- * Iterate over all IVHD entries in the ACPI table and find the highest device
- * id which we need to handle. This is the first of three functions which parse
- * the ACPI table. So we check the checksum here.
- */
-static int __init find_last_devid_acpi(struct acpi_table_header *table)
-{
- int i;
- u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
-
- /*
- * Validate checksum here so we don't need to do it when
- * we actually parse the table
- */
- for (i = 0; i < table->length; ++i)
- checksum += p[i];
- if (checksum != 0) {
- /* ACPI table corrupt */
- amd_iommu_init_err = -ENODEV;
- return 0;
- }
-
- p += IVRS_HEADER_LENGTH;
-
- end += table->length;
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (h->type) {
- case ACPI_IVHD_TYPE:
- find_last_devid_from_ivhd(h);
- break;
- default:
- break;
- }
- p += h->length;
- }
- WARN_ON(p != end);
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The following functions belong the the code path which parses the ACPI table
- * the second time. In this ACPI parsing iteration we allocate IOMMU specific
- * data structures, initialize the device/alias/rlookup table and also
- * basically initialize the hardware.
- *
- ****************************************************************************/
-
-/*
- * Allocates the command buffer. This buffer is per AMD IOMMU. We can
- * write commands to that buffer later and the IOMMU will execute them
- * asynchronously
- */
-static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
-{
- u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(CMD_BUFFER_SIZE));
-
- if (cmd_buf == NULL)
- return NULL;
-
- iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
-
- return cmd_buf;
-}
-
-/*
- * This function resets the command buffer if the IOMMU stopped fetching
- * commands from it.
- */
-void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
-{
- iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
- writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-}
-
-/*
- * This function writes the command buffer address to the hardware and
- * enables it.
- */
-static void iommu_enable_command_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->cmd_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->cmd_buf);
- entry |= MMIO_CMD_SIZE_512;
-
- memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
- &entry, sizeof(entry));
-
- amd_iommu_reset_cmd_buffer(iommu);
- iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
-}
-
-static void __init free_command_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->cmd_buf,
- get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
-}
-
-/* allocates the memory where the IOMMU will log its events to */
-static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
-{
- iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(EVT_BUFFER_SIZE));
-
- if (iommu->evt_buf == NULL)
- return NULL;
-
- iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
- return iommu->evt_buf;
-}
-
-static void iommu_enable_event_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->evt_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
-
- memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
- &entry, sizeof(entry));
-
- /* set head and tail to zero manually */
- writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-}
-
-static void __init free_event_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
-}
-
-/* sets a specific bit in the device table entry. */
-static void set_dev_entry_bit(u16 devid, u8 bit)
-{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
-
- amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
-}
-
-static int get_dev_entry_bit(u16 devid, u8 bit)
-{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
-
- return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
-}
-
-
-void amd_iommu_apply_erratum_63(u16 devid)
-{
- int sysmgt;
-
- sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
- (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
-
- if (sysmgt == 0x01)
- set_dev_entry_bit(devid, DEV_ENTRY_IW);
-}
-
-/* Writes the specific IOMMU for a device into the rlookup table */
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
- amd_iommu_rlookup_table[devid] = iommu;
-}
-
-/*
- * This function takes the device specific flags read from the ACPI
- * table and sets up the device table entry with that information
- */
-static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
- u16 devid, u32 flags, u32 ext_flags)
-{
- if (flags & ACPI_DEVFLAG_INITPASS)
- set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
- if (flags & ACPI_DEVFLAG_EXTINT)
- set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
- if (flags & ACPI_DEVFLAG_NMI)
- set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
- if (flags & ACPI_DEVFLAG_SYSMGT1)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
- if (flags & ACPI_DEVFLAG_SYSMGT2)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
- if (flags & ACPI_DEVFLAG_LINT0)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
- if (flags & ACPI_DEVFLAG_LINT1)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-
- amd_iommu_apply_erratum_63(devid);
-
- set_iommu_for_device(iommu, devid);
-}
-
-/*
- * Reads the device exclusion range from ACPI and initialize IOMMU with
- * it
- */
-static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
-{
- struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-
- if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
- return;
-
- if (iommu) {
- /*
- * We only can configure exclusion ranges per IOMMU, not
- * per device. But we can enable the exclusion range per
- * device. This is done here
- */
- set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
- iommu->exclusion_start = m->range_start;
- iommu->exclusion_length = m->range_length;
- }
-}
-
-/*
- * This function reads some important data from the IOMMU PCI space and
- * initializes the driver data structure with it. It reads the hardware
- * capabilities and the first/last device entries
- */
-static void __init init_iommu_from_pci(struct amd_iommu *iommu)
-{
- int cap_ptr = iommu->cap_ptr;
- u32 range, misc;
- int i, j;
-
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
- &iommu->cap);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
- &range);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
- &misc);
-
- iommu->first_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_FD(range));
- iommu->last_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_LD(range));
- iommu->evt_msi_num = MMIO_MSI_NUM(misc);
-
- if (!is_rd890_iommu(iommu->dev))
- return;
-
- /*
- * Some rd890 systems may not be fully reconfigured by the BIOS, so
- * it's necessary for us to store this information so it can be
- * reprogrammed on resume
- */
-
- pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
- &iommu->stored_addr_lo);
- pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
- &iommu->stored_addr_hi);
-
- /* Low bit locks writes to configuration space */
- iommu->stored_addr_lo &= ~1;
-
- for (i = 0; i < 6; i++)
- for (j = 0; j < 0x12; j++)
- iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
-
- for (i = 0; i < 0x83; i++)
- iommu->stored_l2[i] = iommu_read_l2(iommu, i);
-}
-
-/*
- * Takes a pointer to an AMD IOMMU entry in the ACPI table and
- * initializes the hardware and our data structures with it.
- */
-static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
- struct ivhd_header *h)
-{
- u8 *p = (u8 *)h;
- u8 *end = p, flags = 0;
- u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
- u32 ext_flags = 0;
- bool alias = false;
- struct ivhd_entry *e;
-
- /*
- * First save the recommended feature enable bits from ACPI
- */
- iommu->acpi_flags = h->flags;
-
- /*
- * Done. Now parse the device entries
- */
- p += sizeof(struct ivhd_header);
- end += h->length;
-
-
- while (p < end) {
- e = (struct ivhd_entry *)p;
- switch (e->type) {
- case IVHD_DEV_ALL:
-
- DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
- " last device %02x:%02x.%x flags: %02x\n",
- PCI_BUS(iommu->first_device),
- PCI_SLOT(iommu->first_device),
- PCI_FUNC(iommu->first_device),
- PCI_BUS(iommu->last_device),
- PCI_SLOT(iommu->last_device),
- PCI_FUNC(iommu->last_device),
- e->flags);
-
- for (dev_i = iommu->first_device;
- dev_i <= iommu->last_device; ++dev_i)
- set_dev_entry_from_acpi(iommu, dev_i,
- e->flags, 0);
- break;
- case IVHD_DEV_SELECT:
-
- DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
- "flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
- break;
- case IVHD_DEV_SELECT_RANGE_START:
-
- DUMP_printk(" DEV_SELECT_RANGE_START\t "
- "devid: %02x:%02x.%x flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = 0;
- alias = false;
- break;
- case IVHD_DEV_ALIAS:
-
- DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
- "flags: %02x devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid = e->devid;
- devid_to = e->ext >> 8;
- set_dev_entry_from_acpi(iommu, devid , e->flags, 0);
- set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
- amd_iommu_alias_table[devid] = devid_to;
- break;
- case IVHD_DEV_ALIAS_RANGE:
-
- DUMP_printk(" DEV_ALIAS_RANGE\t\t "
- "devid: %02x:%02x.%x flags: %02x "
- "devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid_start = e->devid;
- flags = e->flags;
- devid_to = e->ext >> 8;
- ext_flags = 0;
- alias = true;
- break;
- case IVHD_DEV_EXT_SELECT:
-
- DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
- "flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags,
- e->ext);
- break;
- case IVHD_DEV_EXT_SELECT_RANGE:
-
- DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
- "%02x:%02x.%x flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = e->ext;
- alias = false;
- break;
- case IVHD_DEV_RANGE_END:
-
- DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid));
-
- devid = e->devid;
- for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
- if (alias) {
- amd_iommu_alias_table[dev_i] = devid_to;
- set_dev_entry_from_acpi(iommu,
- devid_to, flags, ext_flags);
- }
- set_dev_entry_from_acpi(iommu, dev_i,
- flags, ext_flags);
- }
- break;
- default:
- break;
- }
-
- p += ivhd_entry_length(p);
- }
-}
-
-/* Initializes the device->iommu mapping for the driver */
-static int __init init_iommu_devices(struct amd_iommu *iommu)
-{
- u16 i;
-
- for (i = iommu->first_device; i <= iommu->last_device; ++i)
- set_iommu_for_device(iommu, i);
-
- return 0;
-}
-
-static void __init free_iommu_one(struct amd_iommu *iommu)
-{
- free_command_buffer(iommu);
- free_event_buffer(iommu);
- iommu_unmap_mmio_space(iommu);
-}
-
-static void __init free_iommu_all(void)
-{
- struct amd_iommu *iommu, *next;
-
- for_each_iommu_safe(iommu, next) {
- list_del(&iommu->list);
- free_iommu_one(iommu);
- kfree(iommu);
- }
-}
-
-/*
- * This function clues the initialization function for one IOMMU
- * together and also allocates the command buffer and programs the
- * hardware. It does NOT enable the IOMMU. This is done afterwards.
- */
-static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
-{
- spin_lock_init(&iommu->lock);
-
- /* Add IOMMU to internal data structures */
- list_add_tail(&iommu->list, &amd_iommu_list);
- iommu->index = amd_iommus_present++;
-
- if (unlikely(iommu->index >= MAX_IOMMUS)) {
- WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
- return -ENOSYS;
- }
-
- /* Index is fine - add IOMMU to the array */
- amd_iommus[iommu->index] = iommu;
-
- /*
- * Copy data from ACPI table entry to the iommu struct
- */
- iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
- if (!iommu->dev)
- return 1;
-
- iommu->cap_ptr = h->cap_ptr;
- iommu->pci_seg = h->pci_seg;
- iommu->mmio_phys = h->mmio_phys;
- iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
- if (!iommu->mmio_base)
- return -ENOMEM;
-
- iommu->cmd_buf = alloc_command_buffer(iommu);
- if (!iommu->cmd_buf)
- return -ENOMEM;
-
- iommu->evt_buf = alloc_event_buffer(iommu);
- if (!iommu->evt_buf)
- return -ENOMEM;
-
- iommu->int_enabled = false;
-
- init_iommu_from_pci(iommu);
- init_iommu_from_acpi(iommu, h);
- init_iommu_devices(iommu);
-
- if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
- amd_iommu_np_cache = true;
-
- return pci_enable_device(iommu->dev);
-}
-
-/*
- * Iterates over all IOMMU entries in the ACPI table, allocates the
- * IOMMU structure and initializes it with init_iommu_one()
- */
-static int __init init_iommu_all(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
- struct amd_iommu *iommu;
- int ret;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (*p) {
- case ACPI_IVHD_TYPE:
-
- DUMP_printk("device: %02x:%02x.%01x cap: %04x "
- "seg: %d flags: %01x info %04x\n",
- PCI_BUS(h->devid), PCI_SLOT(h->devid),
- PCI_FUNC(h->devid), h->cap_ptr,
- h->pci_seg, h->flags, h->info);
- DUMP_printk(" mmio-addr: %016llx\n",
- h->mmio_phys);
-
- iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
- if (iommu == NULL) {
- amd_iommu_init_err = -ENOMEM;
- return 0;
- }
-
- ret = init_iommu_one(iommu, h);
- if (ret) {
- amd_iommu_init_err = ret;
- return 0;
- }
- break;
- default:
- break;
- }
- p += h->length;
-
- }
- WARN_ON(p != end);
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The following functions initialize the MSI interrupts for all IOMMUs
- * in the system. Its a bit challenging because there could be multiple
- * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
- * pci_dev.
- *
- ****************************************************************************/
-
-static int iommu_setup_msi(struct amd_iommu *iommu)
-{
- int r;
-
- if (pci_enable_msi(iommu->dev))
- return 1;
-
- r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
- IRQF_SAMPLE_RANDOM,
- "AMD-Vi",
- NULL);
-
- if (r) {
- pci_disable_msi(iommu->dev);
- return 1;
- }
-
- iommu->int_enabled = true;
- iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-
- return 0;
-}
-
-static int iommu_init_msi(struct amd_iommu *iommu)
-{
- if (iommu->int_enabled)
- return 0;
-
- if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
- return iommu_setup_msi(iommu);
-
- return 1;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the third pass of parsing the ACPI
- * table. In this last pass the memory mapping requirements are
- * gathered (like exclusion and unity mapping reanges).
- *
- ****************************************************************************/
-
-static void __init free_unity_maps(void)
-{
- struct unity_map_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
- list_del(&entry->list);
- kfree(entry);
- }
-}
-
-/* called when we find an exclusion range definition in ACPI */
-static int __init init_exclusion_range(struct ivmd_header *m)
-{
- int i;
-
- switch (m->type) {
- case ACPI_IVMD_TYPE:
- set_device_exclusion_range(m->devid, m);
- break;
- case ACPI_IVMD_TYPE_ALL:
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- set_device_exclusion_range(i, m);
- break;
- case ACPI_IVMD_TYPE_RANGE:
- for (i = m->devid; i <= m->aux; ++i)
- set_device_exclusion_range(i, m);
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-/* called for unity map ACPI definition */
-static int __init init_unity_map_range(struct ivmd_header *m)
-{
- struct unity_map_entry *e = 0;
- char *s;
-
- e = kzalloc(sizeof(*e), GFP_KERNEL);
- if (e == NULL)
- return -ENOMEM;
-
- switch (m->type) {
- default:
- kfree(e);
- return 0;
- case ACPI_IVMD_TYPE:
- s = "IVMD_TYPEi\t\t\t";
- e->devid_start = e->devid_end = m->devid;
- break;
- case ACPI_IVMD_TYPE_ALL:
- s = "IVMD_TYPE_ALL\t\t";
- e->devid_start = 0;
- e->devid_end = amd_iommu_last_bdf;
- break;
- case ACPI_IVMD_TYPE_RANGE:
- s = "IVMD_TYPE_RANGE\t\t";
- e->devid_start = m->devid;
- e->devid_end = m->aux;
- break;
- }
- e->address_start = PAGE_ALIGN(m->range_start);
- e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
- e->prot = m->flags >> 1;
-
- DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
- " range_start: %016llx range_end: %016llx flags: %x\n", s,
- PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
- PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
- PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
- e->address_start, e->address_end, m->flags);
-
- list_add_tail(&e->list, &amd_iommu_unity_map);
-
- return 0;
-}
-
-/* iterates over all memory definitions we find in the ACPI table */
-static int __init init_memory_definitions(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivmd_header *m;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- m = (struct ivmd_header *)p;
- if (m->flags & IVMD_FLAG_EXCL_RANGE)
- init_exclusion_range(m);
- else if (m->flags & IVMD_FLAG_UNITY_MAP)
- init_unity_map_range(m);
-
- p += m->length;
- }
-
- return 0;
-}
-
-/*
- * Init the device table to not allow DMA access for devices and
- * suppress all page faults
- */
-static void init_device_table(void)
-{
- u16 devid;
-
- for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
- set_dev_entry_bit(devid, DEV_ENTRY_VALID);
- set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
- }
-}
-
-static void iommu_init_flags(struct amd_iommu *iommu)
-{
- iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
- iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
- iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
- iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
- iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
- iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
- /*
- * make IOMMU memory accesses cache coherent
- */
- iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
-}
-
-static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
-{
- int i, j;
- u32 ioc_feature_control;
- struct pci_dev *pdev = NULL;
-
- /* RD890 BIOSes may not have completely reconfigured the iommu */
- if (!is_rd890_iommu(iommu->dev))
- return;
-
- /*
- * First, we need to ensure that the iommu is enabled. This is
- * controlled by a register in the northbridge
- */
- pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
-
- if (!pdev)
- return;
-
- /* Select Northbridge indirect register 0x75 and enable writing */
- pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
- pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
-
- /* Enable the iommu */
- if (!(ioc_feature_control & 0x1))
- pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
-
- pci_dev_put(pdev);
-
- /* Restore the iommu BAR */
- pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
- iommu->stored_addr_lo);
- pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
- iommu->stored_addr_hi);
-
- /* Restore the l1 indirect regs for each of the 6 l1s */
- for (i = 0; i < 6; i++)
- for (j = 0; j < 0x12; j++)
- iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
-
- /* Restore the l2 indirect regs */
- for (i = 0; i < 0x83; i++)
- iommu_write_l2(iommu, i, iommu->stored_l2[i]);
-
- /* Lock PCI setup registers */
- pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
- iommu->stored_addr_lo | 1);
-}
-
-/*
- * This function finally enables all IOMMUs found in the system after
- * they have been initialized
- */
-static void enable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu) {
- iommu_disable(iommu);
- iommu_init_flags(iommu);
- iommu_set_device_table(iommu);
- iommu_enable_command_buffer(iommu);
- iommu_enable_event_buffer(iommu);
- iommu_set_exclusion_range(iommu);
- iommu_init_msi(iommu);
- iommu_enable(iommu);
- }
-}
-
-static void disable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_disable(iommu);
-}
-
-/*
- * Suspend/Resume support
- * disable suspend until real resume implemented
- */
-
-static void amd_iommu_resume(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_apply_resume_quirks(iommu);
-
- /* re-load the hardware */
- enable_iommus();
-
- /*
- * we have to flush after the IOMMUs are enabled because a
- * disabled IOMMU will never execute the commands we send
- */
- amd_iommu_flush_all_devices();
- amd_iommu_flush_all_domains();
-}
-
-static int amd_iommu_suspend(void)
-{
- /* disable IOMMUs to go out of the way for BIOS */
- disable_iommus();
-
- return 0;
-}
-
-static struct syscore_ops amd_iommu_syscore_ops = {
- .suspend = amd_iommu_suspend,
- .resume = amd_iommu_resume,
-};
-
-/*
- * This is the core init function for AMD IOMMU hardware in the system.
- * This function is called from the generic x86 DMA layer initialization
- * code.
- *
- * This function basically parses the ACPI table for AMD IOMMU (IVRS)
- * three times:
- *
- * 1 pass) Find the highest PCI device id the driver has to handle.
- * Upon this information the size of the data structures is
- * determined that needs to be allocated.
- *
- * 2 pass) Initialize the data structures just allocated with the
- * information in the ACPI table about available AMD IOMMUs
- * in the system. It also maps the PCI devices in the
- * system to specific IOMMUs
- *
- * 3 pass) After the basic data structures are allocated and
- * initialized we update them with information about memory
- * remapping requirements parsed out of the ACPI table in
- * this last pass.
- *
- * After that the hardware is initialized and ready to go. In the last
- * step we do some Linux specific things like registering the driver in
- * the dma_ops interface and initializing the suspend/resume support
- * functions. Finally it prints some information about AMD IOMMUs and
- * the driver state and enables the hardware.
- */
-static int __init amd_iommu_init(void)
-{
- int i, ret = 0;
-
- /*
- * First parse ACPI tables to find the largest Bus/Dev/Func
- * we need to handle. Upon this information the shared data
- * structures for the IOMMUs in the system will be allocated
- */
- if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
- return -ENODEV;
-
- ret = amd_iommu_init_err;
- if (ret)
- goto out;
-
- dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
- alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
- rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
-
- ret = -ENOMEM;
-
- /* Device table - directly used by all IOMMUs */
- amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(dev_table_size));
- if (amd_iommu_dev_table == NULL)
- goto out;
-
- /*
- * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
- * IOMMU see for that device
- */
- amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
- get_order(alias_table_size));
- if (amd_iommu_alias_table == NULL)
- goto free;
-
- /* IOMMU rlookup table - find the IOMMU for a specific device */
- amd_iommu_rlookup_table = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(rlookup_table_size));
- if (amd_iommu_rlookup_table == NULL)
- goto free;
-
- amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(MAX_DOMAIN_ID/8));
- if (amd_iommu_pd_alloc_bitmap == NULL)
- goto free;
-
- /* init the device table */
- init_device_table();
-
- /*
- * let all alias entries point to itself
- */
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- amd_iommu_alias_table[i] = i;
-
- /*
- * never allocate domain 0 because its used as the non-allocated and
- * error value placeholder
- */
- amd_iommu_pd_alloc_bitmap[0] = 1;
-
- spin_lock_init(&amd_iommu_pd_lock);
-
- /*
- * now the data structures are allocated and basically initialized
- * start the real acpi table scan
- */
- ret = -ENODEV;
- if (acpi_table_parse("IVRS", init_iommu_all) != 0)
- goto free;
-
- if (amd_iommu_init_err) {
- ret = amd_iommu_init_err;
- goto free;
- }
-
- if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
- goto free;
-
- if (amd_iommu_init_err) {
- ret = amd_iommu_init_err;
- goto free;
- }
-
- ret = amd_iommu_init_devices();
- if (ret)
- goto free;
-
- enable_iommus();
-
- if (iommu_pass_through)
- ret = amd_iommu_init_passthrough();
- else
- ret = amd_iommu_init_dma_ops();
-
- if (ret)
- goto free_disable;
-
- amd_iommu_init_api();
-
- amd_iommu_init_notifier();
-
- register_syscore_ops(&amd_iommu_syscore_ops);
-
- if (iommu_pass_through)
- goto out;
-
- if (amd_iommu_unmap_flush)
- printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
- else
- printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
-
- x86_platform.iommu_shutdown = disable_iommus;
-out:
- return ret;
-
-free_disable:
- disable_iommus();
-
-free:
- amd_iommu_uninit_devices();
-
- free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
- get_order(MAX_DOMAIN_ID/8));
-
- free_pages((unsigned long)amd_iommu_rlookup_table,
- get_order(rlookup_table_size));
-
- free_pages((unsigned long)amd_iommu_alias_table,
- get_order(alias_table_size));
-
- free_pages((unsigned long)amd_iommu_dev_table,
- get_order(dev_table_size));
-
- free_iommu_all();
-
- free_unity_maps();
-
-#ifdef CONFIG_GART_IOMMU
- /*
- * We failed to initialize the AMD IOMMU - try fallback to GART
- * if possible.
- */
- gart_iommu_init();
-
-#endif
-
- goto out;
-}
-
-/****************************************************************************
- *
- * Early detect code. This code runs at IOMMU detection time in the DMA
- * layer. It just looks if there is an IVRS ACPI table to detect AMD
- * IOMMUs
- *
- ****************************************************************************/
-static int __init early_amd_iommu_detect(struct acpi_table_header *table)
-{
- return 0;
-}
-
-int __init amd_iommu_detect(void)
-{
- if (no_iommu || (iommu_detected && !gart_iommu_aperture))
- return -ENODEV;
-
- if (amd_iommu_disabled)
- return -ENODEV;
-
- if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
- iommu_detected = 1;
- amd_iommu_detected = 1;
- x86_init.iommu.iommu_init = amd_iommu_init;
-
- /* Make sure ACS will be enabled */
- pci_request_acs();
- return 1;
- }
- return -ENODEV;
-}
-
-/****************************************************************************
- *
- * Parsing functions for the AMD IOMMU specific kernel command line
- * options.
- *
- ****************************************************************************/
-
-static int __init parse_amd_iommu_dump(char *str)
-{
- amd_iommu_dump = true;
-
- return 1;
-}
-
-static int __init parse_amd_iommu_options(char *str)
-{
- for (; *str; ++str) {
- if (strncmp(str, "fullflush", 9) == 0)
- amd_iommu_unmap_flush = true;
- if (strncmp(str, "off", 3) == 0)
- amd_iommu_disabled = true;
- }
-
- return 1;
-}
-
-__setup("amd_iommu_dump", parse_amd_iommu_dump);
-__setup("amd_iommu=", parse_amd_iommu_options);
-
-IOMMU_INIT_FINISH(amd_iommu_detect,
- gart_iommu_hole_init,
- 0,
- 0);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index cd1ffed4ee2..afdc3f756de 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -27,15 +27,12 @@
* timer, but by default APB timer has higher rating than local APIC timers.
*/
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
#include <linux/delay.h>
+#include <linux/dw_apb_timer.h>
#include <linux/errno.h>
#include <linux/init.h>
-#include <linux/sysdev.h>
#include <linux/slab.h>
#include <linux/pm.h>
-#include <linux/pci.h>
#include <linux/sfi.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
@@ -44,76 +41,48 @@
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
#include <asm/mrst.h>
+#include <asm/time.h>
-#define APBT_MASK CLOCKSOURCE_MASK(32)
-#define APBT_SHIFT 22
#define APBT_CLOCKEVENT_RATING 110
#define APBT_CLOCKSOURCE_RATING 250
-#define APBT_MIN_DELTA_USEC 200
-#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
#define APBT_CLOCKEVENT0_NUM (0)
-#define APBT_CLOCKEVENT1_NUM (1)
#define APBT_CLOCKSOURCE_NUM (2)
-static unsigned long apbt_address;
+static phys_addr_t apbt_address;
static int apb_timer_block_enabled;
static void __iomem *apbt_virt_address;
-static int phy_cs_timer_id;
/*
* Common DW APB timer info
*/
-static uint64_t apbt_freq;
-
-static void apbt_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static int apbt_next_event(unsigned long delta,
- struct clock_event_device *evt);
-static cycle_t apbt_read_clocksource(struct clocksource *cs);
-static void apbt_restart_clocksource(struct clocksource *cs);
+static unsigned long apbt_freq;
struct apbt_dev {
- struct clock_event_device evt;
- unsigned int num;
- int cpu;
- unsigned int irq;
- unsigned int tick;
- unsigned int count;
- unsigned int flags;
- char name[10];
+ struct dw_apb_clock_event_device *timer;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ char name[10];
};
-static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+static struct dw_apb_clocksource *clocksource_apbt;
-#ifdef CONFIG_SMP
-static unsigned int apbt_num_timers_used;
-static struct apbt_dev *apbt_devs;
-#endif
-
-static inline unsigned long apbt_readl_reg(unsigned long a)
+static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
{
- return readl(apbt_virt_address + a);
+ return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
}
-static inline void apbt_writel_reg(unsigned long d, unsigned long a)
-{
- writel(d, apbt_virt_address + a);
-}
-
-static inline unsigned long apbt_readl(int n, unsigned long a)
-{
- return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
-static inline void apbt_writel(int n, unsigned long d, unsigned long a)
-{
- writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+#endif
static inline void apbt_set_mapping(void)
{
struct sfi_timer_table_entry *mtmr;
+ int phy_cs_timer_id = 0;
if (apbt_virt_address) {
pr_debug("APBT base already mapped\n");
@@ -125,21 +94,18 @@ static inline void apbt_set_mapping(void)
APBT_CLOCKEVENT0_NUM);
return;
}
- apbt_address = (unsigned long)mtmr->phys_addr;
+ apbt_address = (phys_addr_t)mtmr->phys_addr;
if (!apbt_address) {
printk(KERN_WARNING "No timer base from SFI, use default\n");
apbt_address = APBT_DEFAULT_BASE;
}
apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
- if (apbt_virt_address) {
- pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
- (void *)apbt_address, (void *)apbt_virt_address);
- } else {
- pr_debug("Failed mapping APBT phy address at %p\n",\
- (void *)apbt_address);
+ if (!apbt_virt_address) {
+ pr_debug("Failed mapping APBT phy address at %lu\n",\
+ (unsigned long)apbt_address);
goto panic_noapbt;
}
- apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
+ apbt_freq = mtmr->freq_hz;
sfi_free_mtmr(mtmr);
/* Now figure out the physical timer id for clocksource device */
@@ -148,9 +114,14 @@ static inline void apbt_set_mapping(void)
goto panic_noapbt;
/* Now figure out the physical timer id */
- phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
- / APBTMRS_REG_SIZE;
- pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
+ pr_debug("Use timer %d for clocksource\n",
+ (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
+ phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
+ APBTMRS_REG_SIZE;
+
+ clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
+ "apbt0", apbt_virt_address + phy_cs_timer_id *
+ APBTMRS_REG_SIZE, apbt_freq);
return;
panic_noapbt:
@@ -172,83 +143,6 @@ static inline int is_apbt_capable(void)
return apbt_virt_address ? 1 : 0;
}
-static struct clocksource clocksource_apbt = {
- .name = "apbt",
- .rating = APBT_CLOCKSOURCE_RATING,
- .read = apbt_read_clocksource,
- .mask = APBT_MASK,
- .shift = APBT_SHIFT,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
- .resume = apbt_restart_clocksource,
-};
-
-/* boot APB clock event device */
-static struct clock_event_device apbt_clockevent = {
- .name = "apbt0",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = apbt_set_mode,
- .set_next_event = apbt_next_event,
- .shift = APBT_SHIFT,
- .irq = 0,
- .rating = APBT_CLOCKEVENT_RATING,
-};
-
-/*
- * start count down from 0xffff_ffff. this is done by toggling the enable bit
- * then load initial load count to ~0.
- */
-static void apbt_start_counter(int n)
-{
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
- apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
- /* enable, mask interrupt */
- ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
- ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
- /* read it once to get cached counter value initialized */
- apbt_read_clocksource(&clocksource_apbt);
-}
-
-static irqreturn_t apbt_interrupt_handler(int irq, void *data)
-{
- struct apbt_dev *dev = (struct apbt_dev *)data;
- struct clock_event_device *aevt = &dev->evt;
-
- if (!aevt->event_handler) {
- printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
- dev->num);
- return IRQ_NONE;
- }
- aevt->event_handler(aevt);
- return IRQ_HANDLED;
-}
-
-static void apbt_restart_clocksource(struct clocksource *cs)
-{
- apbt_start_counter(phy_cs_timer_id);
-}
-
-static void apbt_enable_int(int n)
-{
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
- /* clear pending intr */
- apbt_readl(n, APBTMR_N_EOI);
- ctrl &= ~APBTMR_CONTROL_INT;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-static void apbt_disable_int(int n)
-{
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
- ctrl |= APBTMR_CONTROL_INT;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-
static int __init apbt_clockevent_register(void)
{
struct sfi_timer_table_entry *mtmr;
@@ -261,45 +155,21 @@ static int __init apbt_clockevent_register(void)
return -ENODEV;
}
- /*
- * We need to calculate the scaled math multiplication factor for
- * nanosecond to apbt tick conversion.
- * mult = (nsec/cycle)*2^APBT_SHIFT
- */
- apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
- , NSEC_PER_SEC, APBT_SHIFT);
-
- /* Calculate the min / max delta */
- apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
- &apbt_clockevent);
- apbt_clockevent.min_delta_ns = clockevent_delta2ns(
- APBT_MIN_DELTA_USEC*apbt_freq,
- &apbt_clockevent);
- /*
- * Start apbt with the boot cpu mask and make it
- * global if not used for per cpu timer.
- */
- apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
adev->num = smp_processor_id();
- memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
+ adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
+ mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+ APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
+ adev_virt_addr(adev), 0, apbt_freq);
+ /* Firmware does EOI handling for us. */
+ adev->timer->eoi = NULL;
if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
- adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
- global_clock_event = &adev->evt;
+ global_clock_event = &adev->timer->ced;
printk(KERN_DEBUG "%s clockevent registered as global\n",
global_clock_event->name);
}
- if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- apbt_clockevent.name, adev)) {
- printk(KERN_ERR "Failed request IRQ for APBT%d\n",
- apbt_clockevent.irq);
- }
-
- clockevents_register_device(&adev->evt);
- /* Start APBT 0 interrupts */
- apbt_enable_int(APBT_CLOCKEVENT0_NUM);
+ dw_apb_clockevent_register(adev->timer);
sfi_free_mtmr(mtmr);
return 0;
@@ -317,52 +187,34 @@ static void apbt_setup_irq(struct apbt_dev *adev)
irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
/* APB timer irqs are set up as mp_irqs, timer is edge type */
__irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
-
- if (system_state == SYSTEM_BOOTING) {
- if (request_irq(adev->irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED |
- IRQF_NOBALANCING,
- adev->name, adev)) {
- printk(KERN_ERR "Failed request IRQ for APBT%d\n",
- adev->num);
- }
- } else
- enable_irq(adev->irq);
}
/* Should be called with per cpu */
void apbt_setup_secondary_clock(void)
{
struct apbt_dev *adev;
- struct clock_event_device *aevt;
int cpu;
/* Don't register boot CPU clockevent */
cpu = smp_processor_id();
if (!cpu)
return;
- /*
- * We need to calculate the scaled math multiplication factor for
- * nanosecond to apbt tick conversion.
- * mult = (nsec/cycle)*2^APBT_SHIFT
- */
- printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
- adev = &per_cpu(cpu_apbt_dev, cpu);
- aevt = &adev->evt;
- memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
- aevt->cpumask = cpumask_of(cpu);
- aevt->name = adev->name;
- aevt->mode = CLOCK_EVT_MODE_UNUSED;
+ adev = &__get_cpu_var(cpu_apbt_dev);
+ if (!adev->timer) {
+ adev->timer = dw_apb_clockevent_init(cpu, adev->name,
+ APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
+ adev->irq, apbt_freq);
+ adev->timer->eoi = NULL;
+ } else {
+ dw_apb_clockevent_resume(adev->timer);
+ }
- printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
- cpu, aevt->name, *(u32 *)aevt->cpumask);
+ printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
+ cpu, adev->name, adev->cpu);
apbt_setup_irq(adev);
-
- clockevents_register_device(aevt);
-
- apbt_enable_int(cpu);
+ dw_apb_clockevent_register(adev->timer);
return;
}
@@ -385,13 +237,12 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
switch (action & 0xf) {
case CPU_DEAD:
- disable_irq(adev->irq);
- apbt_disable_int(cpu);
+ dw_apb_clockevent_pause(adev->timer);
if (system_state == SYSTEM_RUNNING) {
pr_debug("skipping APBT CPU %lu offline\n", cpu);
} else if (adev) {
pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
- free_irq(adev->irq, adev);
+ dw_apb_clockevent_stop(adev->timer);
}
break;
default:
@@ -416,116 +267,16 @@ void apbt_setup_secondary_clock(void) {}
#endif /* CONFIG_SMP */
-static void apbt_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- unsigned long ctrl;
- uint64_t delta;
- int timer_num;
- struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
- BUG_ON(!apbt_virt_address);
-
- timer_num = adev->num;
- pr_debug("%s CPU %d timer %d mode=%d\n",
- __func__, first_cpu(*evt->cpumask), timer_num, mode);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
- delta >>= apbt_clockevent.shift;
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- /*
- * DW APB p. 46, have to disable timer before load counter,
- * may cause sync problem.
- */
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- udelay(1);
- pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
- apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
- ctrl |= APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- break;
- /* APB timer does not have one-shot mode, use free running mode */
- case CLOCK_EVT_MODE_ONESHOT:
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- /*
- * set free running mode, this mode will let timer reload max
- * timeout which will give time (3min on 25MHz clock) to rearm
- * the next event, therefore emulate the one-shot mode.
- */
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
-
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- /* write again to set free running mode */
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-
- /*
- * DW APB p. 46, load counter with all 1s before starting free
- * running mode.
- */
- apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
- ctrl &= ~APBTMR_CONTROL_INT;
- ctrl |= APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- break;
-
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- apbt_disable_int(timer_num);
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- apbt_enable_int(timer_num);
- break;
- }
-}
-
-static int apbt_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- unsigned long ctrl;
- int timer_num;
-
- struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
- timer_num = adev->num;
- /* Disable timer */
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- /* write new count */
- apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
- ctrl |= APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- return 0;
-}
-
-static cycle_t apbt_read_clocksource(struct clocksource *cs)
-{
- unsigned long current_count;
-
- current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
- return (cycle_t)~current_count;
-}
-
static int apbt_clocksource_register(void)
{
u64 start, now;
cycle_t t1;
/* Start the counter, use timer 2 as source, timer 0/1 for event */
- apbt_start_counter(phy_cs_timer_id);
+ dw_apb_clocksource_start(clocksource_apbt);
/* Verify whether apbt counter works */
- t1 = apbt_read_clocksource(&clocksource_apbt);
+ t1 = dw_apb_clocksource_read(clocksource_apbt);
rdtscll(start);
/*
@@ -540,17 +291,10 @@ static int apbt_clocksource_register(void)
} while ((now - start) < 200000UL);
/* APBT is the only always on clocksource, it has to work! */
- if (t1 == apbt_read_clocksource(&clocksource_apbt))
+ if (t1 == dw_apb_clocksource_read(clocksource_apbt))
panic("APBT counter not counting. APBT disabled\n");
- /*
- * initialize and register APBT clocksource
- * convert that to ns/clock cycle
- * mult = (ns/c) * 2^APBT_SHIFT
- */
- clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
- (unsigned long) apbt_freq, APBT_SHIFT);
- clocksource_register(&clocksource_apbt);
+ dw_apb_clocksource_register(clocksource_apbt);
return 0;
}
@@ -574,10 +318,7 @@ void __init apbt_time_init(void)
if (apb_timer_block_enabled)
return;
apbt_set_mapping();
- if (apbt_virt_address) {
- pr_debug("Found APBT version 0x%lx\n",\
- apbt_readl_reg(APBTMRS_COMP_VERSION));
- } else
+ if (!apbt_virt_address)
goto out_noapbt;
/*
* Read the frequency and check for a sane value, for ESL model
@@ -585,7 +326,7 @@ void __init apbt_time_init(void)
*/
if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
- pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
+ pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
goto out_noapbt;
}
if (apbt_clocksource_register()) {
@@ -611,30 +352,20 @@ void __init apbt_time_init(void)
} else {
percpu_timer = 0;
apbt_num_timers_used = 1;
- adev = &per_cpu(cpu_apbt_dev, 0);
- adev->flags &= ~APBT_DEV_USED;
}
pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
/* here we set up per CPU timer data structure */
- apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
- GFP_KERNEL);
- if (!apbt_devs) {
- printk(KERN_ERR "Failed to allocate APB timer devices\n");
- return;
- }
for (i = 0; i < apbt_num_timers_used; i++) {
adev = &per_cpu(cpu_apbt_dev, i);
adev->num = i;
adev->cpu = i;
p_mtmr = sfi_get_mtmr(i);
- if (p_mtmr) {
- adev->tick = p_mtmr->freq_hz;
+ if (p_mtmr)
adev->irq = p_mtmr->irq;
- } else
+ else
printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
- adev->count = 0;
- sprintf(adev->name, "apbt%d", i);
+ snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
}
#endif
@@ -646,17 +377,8 @@ out_noapbt:
panic("failed to enable APB timer\n");
}
-static inline void apbt_disable(int n)
-{
- if (is_apbt_capable()) {
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
- }
-}
-
/* called before apb_timer_enable, use early map */
-unsigned long apbt_quick_calibrate()
+unsigned long apbt_quick_calibrate(void)
{
int i, scale;
u64 old, new;
@@ -665,31 +387,31 @@ unsigned long apbt_quick_calibrate()
u32 loop, shift;
apbt_set_mapping();
- apbt_start_counter(phy_cs_timer_id);
+ dw_apb_clocksource_start(clocksource_apbt);
/* check if the timer can count down, otherwise return */
- old = apbt_read_clocksource(&clocksource_apbt);
+ old = dw_apb_clocksource_read(clocksource_apbt);
i = 10000;
while (--i) {
- if (old != apbt_read_clocksource(&clocksource_apbt))
+ if (old != dw_apb_clocksource_read(clocksource_apbt))
break;
}
if (!i)
goto failed;
/* count 16 ms */
- loop = (apbt_freq * 1000) << 4;
+ loop = (apbt_freq / 1000) << 4;
/* restart the timer to ensure it won't get to 0 in the calibration */
- apbt_start_counter(phy_cs_timer_id);
+ dw_apb_clocksource_start(clocksource_apbt);
- old = apbt_read_clocksource(&clocksource_apbt);
+ old = dw_apb_clocksource_read(clocksource_apbt);
old += loop;
t1 = __native_read_tsc();
do {
- new = apbt_read_clocksource(&clocksource_apbt);
+ new = dw_apb_clocksource_read(clocksource_apbt);
} while (new < old);
t2 = __native_read_tsc();
@@ -701,7 +423,7 @@ unsigned long apbt_quick_calibrate()
return 0;
}
scale = (int)div_u64((t2 - t1), loop >> shift);
- khz = (scale * apbt_freq * 1000) >> shift;
+ khz = (scale * (apbt_freq / 1000)) >> shift;
printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
return khz;
failed:
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 86d1ad4962a..3d2661ca654 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -30,6 +30,22 @@
#include <asm/amd_nb.h>
#include <asm/x86_init.h>
+/*
+ * Using 512M as goal, in case kexec will load kernel_big
+ * that will do the on-position decompress, and could overlap with
+ * with the gart aperture that is used.
+ * Sequence:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_small (gart area become e820_reserved)
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kerne_big (uncompressed size will be big than 64M or 128M)
+ * So don't use 512M below as gart iommu, leave the space for kernel
+ * code for safe.
+ */
+#define GART_MIN_ADDR (512ULL << 20)
+#define GART_MAX_ADDR (1ULL << 32)
+
int gart_iommu_aperture;
int gart_iommu_aperture_disabled __initdata;
int gart_iommu_aperture_allowed __initdata;
@@ -70,21 +86,9 @@ static u32 __init allocate_aperture(void)
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
- /*
- * using 512M as goal, in case kexec will load kernel_big
- * that will do the on position decompress, and could overlap with
- * that position with gart that is used.
- * sequende:
- * kernel_small
- * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
- * ==> kernel_small(gart area become e820_reserved)
- * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
- * ==> kerne_big (uncompressed size will be big than 64M or 128M)
- * so don't use 512M below as gart iommu, leave the space for kernel
- * code for safe
- */
- addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
- if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
+ addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
+ aper_size, aper_size);
+ if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
printk(KERN_ERR
"Cannot allocate aperture memory hole (%lx,%uK)\n",
addr, aper_size>>10);
@@ -499,7 +503,7 @@ out:
* Don't enable translation yet but enable GART IO and CPU
* accesses and set DISTLBWALKPRB since GART table memory is UC.
*/
- u32 ctl = DISTLBWALKPRB | aper_order << 1;
+ u32 ctl = aper_order << 1;
bus = amd_nb_bus_dev_ranges[i].bus;
dev_base = amd_nb_bus_dev_ranges[i].dev_base;
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 3966b564ea4..767fd04f284 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,20 +2,25 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
ifeq ($(CONFIG_X86_64),y)
-obj-y += apic_flat_64.o
-obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
-obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+# APIC probe will depend on the listing order here
obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
+obj-y += apic_flat_64.o
endif
-obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
+# APIC probe will depend on the listing order here
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_ES7000) += es7000_32.o
obj-$(CONFIG_X86_SUMMIT) += summit_32.o
+obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
+obj-$(CONFIG_X86_ES7000) += es7000_32.o
+
+# For 32bit, probe_32 need to be listed last
+obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index fabf01eff77..52fa56399a5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -27,6 +27,7 @@
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/timex.h>
+#include <linux/i8253.h>
#include <linux/dmar.h>
#include <linux/init.h>
#include <linux/cpu.h>
@@ -37,9 +38,8 @@
#include <asm/perf_event.h>
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/mpspec.h>
-#include <asm/i8253.h>
#include <asm/i8259.h>
#include <asm/proto.h>
#include <asm/apic.h>
@@ -48,6 +48,7 @@
#include <asm/hpet.h>
#include <asm/idle.h>
#include <asm/mtrr.h>
+#include <asm/time.h>
#include <asm/smp.h>
#include <asm/mce.h>
#include <asm/tsc.h>
@@ -390,7 +391,8 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
/*
* If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
+ * enables the vector. See also the BKDGs. Must be called with
+ * preemption disabled.
*/
int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
@@ -505,7 +507,7 @@ static void __cpuinit setup_APIC_timer(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) {
+ if (this_cpu_has(X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
/* Make LAPIC timer preferrable over percpu HPET */
lapic_clockevent.rating = 150;
@@ -1237,6 +1239,17 @@ void __cpuinit setup_local_APIC(void)
/* always use the value from LDR */
early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
logical_smp_processor_id();
+
+ /*
+ * Some NUMA implementations (NUMAQ) don't initialize apicid to
+ * node mapping during NUMA init. Now that logical apicid is
+ * guaranteed to be known, give it another chance. This is already
+ * a bit too late - percpu allocation has already happened without
+ * proper NUMA affinity.
+ */
+ if (apic->x86_32_numa_cpu_node)
+ set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
+ apic->x86_32_numa_cpu_node(cpu));
#endif
/*
@@ -1417,7 +1430,7 @@ void enable_x2apic(void)
rdmsr(MSR_IA32_APICBASE, msr, msr2);
if (!(msr & X2APIC_ENABLE)) {
printk_once(KERN_INFO "Enabling x2apic\n");
- wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+ wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
}
}
#endif /* CONFIG_X86_X2APIC */
@@ -1450,7 +1463,6 @@ int __init enable_IR(void)
void __init enable_IR_x2apic(void)
{
unsigned long flags;
- struct IO_APIC_route_entry **ioapic_entries;
int ret, x2apic_enabled = 0;
int dmar_table_init_ret;
@@ -1458,13 +1470,7 @@ void __init enable_IR_x2apic(void)
if (dmar_table_init_ret && !x2apic_supported())
return;
- ioapic_entries = alloc_ioapic_entries();
- if (!ioapic_entries) {
- pr_err("Allocate ioapic_entries failed\n");
- goto out;
- }
-
- ret = save_IO_APIC_setup(ioapic_entries);
+ ret = save_ioapic_entries();
if (ret) {
pr_info("Saving IO-APIC state failed: %d\n", ret);
goto out;
@@ -1472,7 +1478,7 @@ void __init enable_IR_x2apic(void)
local_irq_save(flags);
legacy_pic->mask_all();
- mask_IO_APIC_setup(ioapic_entries);
+ mask_ioapic_entries();
if (dmar_table_init_ret)
ret = 0;
@@ -1503,14 +1509,11 @@ void __init enable_IR_x2apic(void)
nox2apic:
if (!ret) /* IR enabling failed */
- restore_IO_APIC_setup(ioapic_entries);
+ restore_ioapic_entries();
legacy_pic->restore_mask();
local_irq_restore(flags);
out:
- if (ioapic_entries)
- free_ioapic_entries(ioapic_entries);
-
if (x2apic_enabled)
return;
@@ -1812,30 +1815,41 @@ void smp_spurious_interrupt(struct pt_regs *regs)
*/
void smp_error_interrupt(struct pt_regs *regs)
{
- u32 v, v1;
+ u32 v0, v1;
+ u32 i = 0;
+ static const char * const error_interrupt_reason[] = {
+ "Send CS error", /* APIC Error Bit 0 */
+ "Receive CS error", /* APIC Error Bit 1 */
+ "Send accept error", /* APIC Error Bit 2 */
+ "Receive accept error", /* APIC Error Bit 3 */
+ "Redirectable IPI", /* APIC Error Bit 4 */
+ "Send illegal vector", /* APIC Error Bit 5 */
+ "Received illegal vector", /* APIC Error Bit 6 */
+ "Illegal register address", /* APIC Error Bit 7 */
+ };
exit_idle();
irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
- v = apic_read(APIC_ESR);
+ v0 = apic_read(APIC_ESR);
apic_write(APIC_ESR, 0);
v1 = apic_read(APIC_ESR);
ack_APIC_irq();
atomic_inc(&irq_err_count);
- /*
- * Here is what the APIC error bits mean:
- * 0: Send CS error
- * 1: Receive CS error
- * 2: Send accept error
- * 3: Receive accept error
- * 4: Reserved
- * 5: Send illegal vector
- * 6: Received illegal vector
- * 7: Illegal register address
- */
- pr_debug("APIC error on CPU%d: %02x(%02x)\n",
- smp_processor_id(), v , v1);
+ apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
+ smp_processor_id(), v0 , v1);
+
+ v1 = v1 & 0xff;
+ while (v1) {
+ if (v1 & 0x1)
+ apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
+ i++;
+ v1 >>= 1;
+ };
+
+ apic_printk(APIC_DEBUG, KERN_CONT "\n");
+
irq_exit();
}
@@ -1930,10 +1944,28 @@ void disconnect_bsp_APIC(int virt_wire_setup)
void __cpuinit generic_processor_info(int apicid, int version)
{
- int cpu;
+ int cpu, max = nr_cpu_ids;
+ bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
+ phys_cpu_present_map);
+
+ /*
+ * If boot cpu has not been detected yet, then only allow upto
+ * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
+ */
+ if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
+ apicid != boot_cpu_physical_apicid) {
+ int thiscpu = max + disabled_cpus - 1;
+
+ pr_warning(
+ "ACPI: NR_CPUS/possible_cpus limit of %i almost"
+ " reached. Keeping one slot for boot cpu."
+ " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+ disabled_cpus++;
+ return;
+ }
if (num_processors >= nr_cpu_ids) {
- int max = nr_cpu_ids;
int thiscpu = max + disabled_cpus;
pr_warning(
@@ -2003,21 +2035,6 @@ void default_init_apic_ldr(void)
apic_write(APIC_LDR, val);
}
-#ifdef CONFIG_X86_32
-int default_x86_32_numa_cpu_node(int cpu)
-{
-#ifdef CONFIG_NUMA
- int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
-
- if (apicid != BAD_APICID)
- return __apicid_to_node[apicid];
- return NUMA_NO_NODE;
-#else
- return 0;
-#endif
-}
-#endif
-
/*
* Power management
*/
@@ -2088,28 +2105,20 @@ static void lapic_resume(void)
{
unsigned int l, h;
unsigned long flags;
- int maxlvt, ret;
- struct IO_APIC_route_entry **ioapic_entries = NULL;
+ int maxlvt;
if (!apic_pm_state.active)
return;
local_irq_save(flags);
if (intr_remapping_enabled) {
- ioapic_entries = alloc_ioapic_entries();
- if (!ioapic_entries) {
- WARN(1, "Alloc ioapic_entries in lapic resume failed.");
- goto restore;
- }
-
- ret = save_IO_APIC_setup(ioapic_entries);
- if (ret) {
- WARN(1, "Saving IO-APIC state failed: %d\n", ret);
- free_ioapic_entries(ioapic_entries);
- goto restore;
- }
-
- mask_IO_APIC_setup(ioapic_entries);
+ /*
+ * IO-APIC and PIC have their own resume routines.
+ * We just mask them here to make sure the interrupt
+ * subsystem is completely quiet while we enable x2apic
+ * and interrupt-remapping.
+ */
+ mask_ioapic_entries();
legacy_pic->mask_all();
}
@@ -2152,13 +2161,9 @@ static void lapic_resume(void)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- if (intr_remapping_enabled) {
+ if (intr_remapping_enabled)
reenable_intr_remapping(x2apic_mode);
- legacy_pic->restore_mask();
- restore_IO_APIC_setup(ioapic_entries);
- free_ioapic_entries(ioapic_entries);
- }
-restore:
+
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 5652d31fe10..f7a41e4cae4 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -16,6 +16,7 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/hardirq.h>
+#include <linux/module.h>
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
@@ -24,6 +25,12 @@
#include <acpi/acpi_bus.h>
#endif
+static struct apic apic_physflat;
+static struct apic apic_flat;
+
+struct apic __read_mostly *apic = &apic_flat;
+EXPORT_SYMBOL_GPL(apic);
+
static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return 1;
@@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
return initial_apic_id >> index_msb;
}
-struct apic apic_flat = {
+static struct apic apic_flat = {
.name = "flat",
.probe = NULL,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -312,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return per_cpu(x86_cpu_to_apicid, cpu);
}
-struct apic apic_physflat = {
+static int physflat_probe(void)
+{
+ if (apic == &apic_physflat || num_possible_cpus() > 8)
+ return 1;
+
+ return 0;
+}
+
+static struct apic apic_physflat = {
.name = "physical flat",
- .probe = NULL,
+ .probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
.apic_id_registered = flat_apic_id_registered,
@@ -369,3 +384,8 @@ struct apic apic_physflat = {
.wait_icr_idle = native_apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
};
+
+/*
+ * We need to check for physflat first, so this order is important.
+ */
+apic_drivers(apic_physflat, apic_flat);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index f1baa2dc087..775b82bc655 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v)
WARN_ON_ONCE(cpu_has_apic && !disable_apic);
}
-#ifdef CONFIG_X86_32
-static int noop_x86_32_numa_cpu_node(int cpu)
-{
- /* we're always on node 0 */
- return 0;
-}
-#endif
-
struct apic apic_noop = {
.name = "noop",
.probe = noop_probe,
@@ -195,6 +187,5 @@ struct apic apic_noop = {
#ifdef CONFIG_X86_32
.x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
- .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
#endif
};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 541a2e43165..efd737e827f 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -193,7 +193,7 @@ static int probe_bigsmp(void)
return dmi_bigsmp;
}
-struct apic apic_bigsmp = {
+static struct apic apic_bigsmp = {
.name = "bigsmp",
.probe = probe_bigsmp,
@@ -253,5 +253,14 @@ struct apic apic_bigsmp = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
- .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
};
+
+struct apic * __init generic_bigsmp_probe(void)
+{
+ if (probe_bigsmp())
+ return &apic_bigsmp;
+
+ return NULL;
+}
+
+apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 3e9de4854c5..5d513bc47b6 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -48,7 +48,7 @@
#include <linux/io.h>
#include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
@@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void)
nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
}
-static int es7000_numa_cpu_node(int cpu)
-{
- return 0;
-}
-
static int es7000_cpu_present_to_apicid(int mps_cpu)
{
if (!mps_cpu)
@@ -625,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
}
/* We've been warned by a false positive warning.Use __refdata to keep calm. */
-struct apic __refdata apic_es7000_cluster = {
+static struct apic __refdata apic_es7000_cluster = {
.name = "es7000",
.probe = probe_es7000,
@@ -688,10 +683,9 @@ struct apic __refdata apic_es7000_cluster = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = es7000_early_logical_apicid,
- .x86_32_numa_cpu_node = es7000_numa_cpu_node,
};
-struct apic __refdata apic_es7000 = {
+static struct apic __refdata apic_es7000 = {
.name = "es7000",
.probe = probe_es7000,
@@ -752,5 +746,10 @@ struct apic __refdata apic_es7000 = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = es7000_early_logical_apicid,
- .x86_32_numa_cpu_node = es7000_numa_cpu_node,
};
+
+/*
+ * Need to check for es7000 followed by es7000_cluster, so this order
+ * in apic_drivers is important.
+ */
+apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 5260fe91bcb..d5e57db0f7b 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,9 +19,9 @@
#include <linux/delay.h>
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-u64 hw_nmi_get_sample_period(void)
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
- return (u64)(cpu_khz) * 1000 * 60;
+ return (u64)(cpu_khz) * 1000 * watchdog_thresh;
}
#endif
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 68df09bba92..8eb863e27ea 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -76,17 +76,40 @@ int sis_apic_bug = -1;
static DEFINE_RAW_SPINLOCK(ioapic_lock);
static DEFINE_RAW_SPINLOCK(vector_lock);
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
+static struct ioapic {
+ /*
+ * # of IRQ routing registers
+ */
+ int nr_registers;
+ /*
+ * Saved state during suspend/resume, or while enabling intr-remap.
+ */
+ struct IO_APIC_route_entry *saved_registers;
+ /* I/O APIC config */
+ struct mpc_ioapic mp_config;
+ /* IO APIC gsi routing info */
+ struct mp_ioapic_gsi gsi_config;
+ DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} ioapics[MAX_IO_APICS];
-/* I/O APIC entries */
-struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
+#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver
+
+int mpc_ioapic_id(int id)
+{
+ return ioapics[id].mp_config.apicid;
+}
-/* IO APIC gsi routing info */
-struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
+unsigned int mpc_ioapic_addr(int id)
+{
+ return ioapics[id].mp_config.apicaddr;
+}
+
+struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
+{
+ return &ioapics[id].gsi_config;
+}
+
+int nr_ioapics;
/* The one past the highest gsi number used */
u32 gsi_top;
@@ -128,8 +151,8 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
-static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
- struct io_apic_irq_attr *attr);
+static int io_apic_setup_irq_pin(unsigned int irq, int node,
+ struct io_apic_irq_attr *attr);
/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
void mp_save_irq(struct mpc_intsrc *m)
@@ -179,6 +202,14 @@ int __init arch_early_irq_init(void)
io_apic_irqs = ~0UL;
}
+ for (i = 0; i < nr_ioapics; i++) {
+ ioapics[i].saved_registers =
+ kzalloc(sizeof(struct IO_APIC_route_entry) *
+ ioapics[i].nr_registers, GFP_KERNEL);
+ if (!ioapics[i].saved_registers)
+ pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
+ }
+
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
node = cpu_to_node(0);
@@ -297,7 +328,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
+ + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
}
static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
@@ -573,7 +604,7 @@ static void clear_IO_APIC (void)
int apic, pin;
for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
clear_IO_APIC_pin(apic, pin);
}
@@ -615,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str)
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
-struct IO_APIC_route_entry **alloc_ioapic_entries(void)
-{
- int apic;
- struct IO_APIC_route_entry **ioapic_entries;
-
- ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
- GFP_KERNEL);
- if (!ioapic_entries)
- return 0;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- ioapic_entries[apic] =
- kzalloc(sizeof(struct IO_APIC_route_entry) *
- nr_ioapic_registers[apic], GFP_KERNEL);
- if (!ioapic_entries[apic])
- goto nomem;
- }
-
- return ioapic_entries;
-
-nomem:
- while (--apic >= 0)
- kfree(ioapic_entries[apic]);
- kfree(ioapic_entries);
-
- return 0;
-}
-
/*
* Saves all the IO-APIC RTE's
*/
-int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int save_ioapic_entries(void)
{
int apic, pin;
-
- if (!ioapic_entries)
- return -ENOMEM;
+ int err = 0;
for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- return -ENOMEM;
+ if (!ioapics[apic].saved_registers) {
+ err = -ENOMEM;
+ continue;
+ }
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- ioapic_entries[apic][pin] =
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ ioapics[apic].saved_registers[pin] =
ioapic_read_entry(apic, pin);
}
- return 0;
+ return err;
}
/*
* Mask all IO APIC entries.
*/
-void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+void mask_ioapic_entries(void)
{
int apic, pin;
- if (!ioapic_entries)
- return;
-
for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- break;
+ if (!ioapics[apic].saved_registers)
+ continue;
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
struct IO_APIC_route_entry entry;
- entry = ioapic_entries[apic][pin];
+ entry = ioapics[apic].saved_registers[pin];
if (!entry.mask) {
entry.mask = 1;
ioapic_write_entry(apic, pin, entry);
@@ -692,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
}
/*
- * Restore IO APIC entries which was saved in ioapic_entries.
+ * Restore IO APIC entries which was saved in the ioapic structure.
*/
-int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int restore_ioapic_entries(void)
{
int apic, pin;
- if (!ioapic_entries)
- return -ENOMEM;
-
for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- return -ENOMEM;
+ if (!ioapics[apic].saved_registers)
+ continue;
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
ioapic_write_entry(apic, pin,
- ioapic_entries[apic][pin]);
+ ioapics[apic].saved_registers[pin]);
}
return 0;
}
-void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
-{
- int apic;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- kfree(ioapic_entries[apic]);
-
- kfree(ioapic_entries);
-}
-
/*
* Find the IRQ entry number of a certain pin.
*/
@@ -731,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type)
for (i = 0; i < mp_irq_entries; i++)
if (mp_irqs[i].irqtype == type &&
- (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+ (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
mp_irqs[i].dstapic == MP_APIC_ALL) &&
mp_irqs[i].dstirq == pin)
return i;
@@ -773,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type)
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
+ if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
return apic;
}
}
@@ -942,6 +929,7 @@ static int pin_2_irq(int idx, int apic, int pin)
{
int irq;
int bus = mp_irqs[idx].srcbus;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
/*
* Debugging check, we are in big trouble if this message pops up!
@@ -952,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin)
if (test_bit(bus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
} else {
- u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
+ u32 gsi = gsi_cfg->gsi_base + pin;
if (gsi >= NR_IRQS_LEGACY)
irq = gsi;
@@ -1003,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
int lbus = mp_irqs[i].srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+ if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
mp_irqs[i].dstapic == MP_APIC_ALL)
break;
@@ -1222,7 +1210,7 @@ static inline int IO_APIC_irq_trigger(int irq)
int apic, idx, pin;
for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
idx = find_irq_entry(apic, pin, mp_INT);
if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
return irq_trigger(idx);
@@ -1307,6 +1295,16 @@ static int setup_ioapic_entry(int apic_id, int irq,
* irq handler will do the explicit EOI to the io-apic.
*/
ir_entry->vector = pin;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+ "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+ "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+ "Avail:%X Vector:%02X Dest:%08X "
+ "SID:%04X SQ:%X SVT:%X)\n",
+ apic_id, irte.present, irte.fpd, irte.dst_mode,
+ irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+ irte.avail, irte.vector, irte.dest_id,
+ irte.sid, irte.sq, irte.svt);
} else {
entry->delivery_mode = apic->irq_delivery_mode;
entry->dest_mode = apic->irq_dest_mode;
@@ -1349,15 +1347,15 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
- "IRQ %d Mode:%i Active:%i)\n",
- apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
- irq, trigger, polarity);
+ "IRQ %d Mode:%i Active:%i Dest:%d)\n",
+ apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
+ irq, trigger, polarity, dest);
- if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
+ if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
dest, trigger, polarity, cfg->vector, pin)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mp_ioapics[apic_id].apicid, pin);
+ mpc_ioapic_id(apic_id), pin);
__clear_irq_vector(irq, cfg);
return;
}
@@ -1369,17 +1367,13 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
ioapic_write_entry(apic_id, pin, entry);
}
-static struct {
- DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-} mp_ioapic_routing[MAX_IO_APICS];
-
static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
{
if (idx != -1)
return false;
apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
- mp_ioapics[apic_id].apicid, pin);
+ mpc_ioapic_id(apic_id), pin);
return true;
}
@@ -1389,7 +1383,7 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id)
struct io_apic_irq_attr attr;
unsigned int pin, irq;
- for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+ for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
idx = find_irq_entry(apic_id, pin, mp_INT);
if (io_apic_pin_not_connected(idx, apic_id, pin))
continue;
@@ -1511,7 +1505,7 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].apicid, nr_ioapic_registers[i]);
+ mpc_ioapic_id(i), ioapics[i].nr_registers);
/*
* We are a bit conservative about what we expect. We have to
@@ -1531,17 +1525,19 @@ __apicdebuginit(void) print_IO_APIC(void)
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
- printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
+ printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
+ reg_01.bits.entries);
printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
+ printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
+ reg_01.bits.version);
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1566,31 +1562,60 @@ __apicdebuginit(void) print_IO_APIC(void)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
- " Stat Dmod Deli Vect:\n");
+ if (intr_remapping_enabled) {
+ printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
+ " Pol Stat Indx2 Zero Vect:\n");
+ } else {
+ printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+ " Stat Dmod Deli Vect:\n");
+ }
for (i = 0; i <= reg_01.bits.entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- printk(KERN_DEBUG " %02x %03X ",
- i,
- entry.dest
- );
+ if (intr_remapping_enabled) {
+ struct IO_APIC_route_entry entry;
+ struct IR_IO_APIC_route_entry *ir_entry;
+
+ entry = ioapic_read_entry(apic, i);
+ ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
+ printk(KERN_DEBUG " %02x %04X ",
+ i,
+ ir_entry->index
+ );
+ printk("%1d %1d %1d %1d %1d "
+ "%1d %1d %X %02X\n",
+ ir_entry->format,
+ ir_entry->mask,
+ ir_entry->trigger,
+ ir_entry->irr,
+ ir_entry->polarity,
+ ir_entry->delivery_status,
+ ir_entry->index2,
+ ir_entry->zero,
+ ir_entry->vector
+ );
+ } else {
+ struct IO_APIC_route_entry entry;
- printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
+ entry = ioapic_read_entry(apic, i);
+ printk(KERN_DEBUG " %02x %02X ",
+ i,
+ entry.dest
+ );
+ printk("%1d %1d %1d %1d %1d "
+ "%1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector
+ );
+ }
}
}
+
printk(KERN_DEBUG "IRQ to pin mappings:\n");
for_each_active_irq(irq) {
struct irq_pin_list *entry;
@@ -1808,7 +1833,7 @@ __apicdebuginit(int) print_ICs(void)
return 0;
}
-fs_initcall(print_ICs);
+late_initcall(print_ICs);
/* Where if anywhere is the i8259 connect in external int mode */
@@ -1825,7 +1850,7 @@ void __init enable_IO_APIC(void)
for(apic = 0; apic < nr_ioapics; apic++) {
int pin;
/* See if any of the pins is in ExtINT mode */
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
struct IO_APIC_route_entry entry;
entry = ioapic_read_entry(apic, pin);
@@ -1949,14 +1974,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
reg_00.raw = io_apic_read(apic_id, 0);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- old_id = mp_ioapics[apic_id].apicid;
+ old_id = mpc_ioapic_id(apic_id);
- if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
+ if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic_id, mp_ioapics[apic_id].apicid);
+ apic_id, mpc_ioapic_id(apic_id));
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic_id].apicid = reg_00.bits.ID;
+ ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
}
/*
@@ -1965,9 +1990,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (apic->check_apicid_used(&phys_id_present_map,
- mp_ioapics[apic_id].apicid)) {
+ mpc_ioapic_id(apic_id))) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic_id, mp_ioapics[apic_id].apicid);
+ apic_id, mpc_ioapic_id(apic_id));
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -1976,13 +2001,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic_id].apicid = i;
+ ioapics[apic_id].mp_config.apicid = i;
} else {
physid_mask_t tmp;
- apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
+ apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
+ &tmp);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic_id].apicid);
+ mpc_ioapic_id(apic_id));
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -1990,24 +2016,24 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic_id].apicid)
+ if (old_id != mpc_ioapic_id(apic_id))
for (i = 0; i < mp_irq_entries; i++)
if (mp_irqs[i].dstapic == old_id)
mp_irqs[i].dstapic
- = mp_ioapics[apic_id].apicid;
+ = mpc_ioapic_id(apic_id);
/*
* Update the ID register according to the right value
* from the MPC table if they are different.
*/
- if (mp_ioapics[apic_id].apicid == reg_00.bits.ID)
+ if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
continue;
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic_id].apicid);
+ mpc_ioapic_id(apic_id));
- reg_00.bits.ID = mp_ioapics[apic_id].apicid;
+ reg_00.bits.ID = mpc_ioapic_id(apic_id);
raw_spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic_id, 0, reg_00.raw);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2018,7 +2044,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic_id, 0);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
+ if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2404,7 +2430,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin) {
- if (mp_ioapics[entry->apic].apicver >= 0x20) {
+ if (mpc_ioapic_ver(entry->apic) >= 0x20) {
/*
* Intr-remapping uses pin number as the virtual vector
* in the RTE. Actual vector is programmed in
@@ -2918,49 +2944,19 @@ static int __init io_apic_bug_finalize(void)
late_initcall(io_apic_bug_finalize);
-static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS];
-
-static void suspend_ioapic(int ioapic_id)
+static void resume_ioapic_id(int ioapic_id)
{
- struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
- int i;
-
- if (!saved_data)
- return;
-
- for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
- saved_data[i] = ioapic_read_entry(ioapic_id, i);
-}
-
-static int ioapic_suspend(void)
-{
- int ioapic_id;
-
- for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++)
- suspend_ioapic(ioapic_id);
-
- return 0;
-}
-
-static void resume_ioapic(int ioapic_id)
-{
- struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
unsigned long flags;
union IO_APIC_reg_00 reg_00;
- int i;
- if (!saved_data)
- return;
raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic_id, 0);
- if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) {
- reg_00.bits.ID = mp_ioapics[ioapic_id].apicid;
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
io_apic_write(ioapic_id, 0, reg_00.raw);
}
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
- ioapic_write_entry(ioapic_id, i, saved_data[i]);
}
static void ioapic_resume(void)
@@ -2968,28 +2964,18 @@ static void ioapic_resume(void)
int ioapic_id;
for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
- resume_ioapic(ioapic_id);
+ resume_ioapic_id(ioapic_id);
+
+ restore_ioapic_entries();
}
static struct syscore_ops ioapic_syscore_ops = {
- .suspend = ioapic_suspend,
+ .suspend = save_ioapic_entries,
.resume = ioapic_resume,
};
static int __init ioapic_init_ops(void)
{
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- unsigned int size;
-
- size = nr_ioapic_registers[i]
- * sizeof(struct IO_APIC_route_entry);
- ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL);
- if (!ioapic_saved_data[i])
- pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
- }
-
register_syscore_ops(&ioapic_syscore_ops);
return 0;
@@ -3570,7 +3556,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
}
#endif /* CONFIG_HT_IRQ */
-int
+static int
io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
{
struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
@@ -3585,21 +3571,21 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
return ret;
}
-static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
- struct io_apic_irq_attr *attr)
+int io_apic_setup_irq_pin_once(unsigned int irq, int node,
+ struct io_apic_irq_attr *attr)
{
unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
int ret;
/* Avoid redundant programming */
- if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) {
+ if (test_bit(pin, ioapics[id].pin_programmed)) {
pr_debug("Pin %d-%d already programmed\n",
- mp_ioapics[id].apicid, pin);
+ mpc_ioapic_id(id), pin);
return 0;
}
ret = io_apic_setup_irq_pin(irq, node, attr);
if (!ret)
- set_bit(pin, mp_ioapic_routing[id].pin_programmed);
+ set_bit(pin, ioapics[id].pin_programmed);
return ret;
}
@@ -3764,8 +3750,7 @@ static u8 __init io_apic_unique_id(u8 id)
bitmap_zero(used, 256);
for (i = 0; i < nr_ioapics; i++) {
- struct mpc_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->apicid, used);
+ __set_bit(mpc_ioapic_id(i), used);
}
if (!test_bit(id, used))
return id;
@@ -3825,7 +3810,7 @@ void __init setup_ioapic_dest(void)
return;
for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
irq_entry = find_irq_entry(ioapic, pin, mp_INT);
if (irq_entry == -1)
continue;
@@ -3896,7 +3881,7 @@ void __init ioapic_and_gsi_init(void)
ioapic_res = ioapic_setup_resources(nr_ioapics);
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].apicaddr;
+ ioapic_phys = mpc_ioapic_addr(i);
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
printk(KERN_ERR
@@ -3956,8 +3941,9 @@ int mp_find_ioapic(u32 gsi)
/* Find the IOAPIC that manages this GSI. */
for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_gsi_routing[i].gsi_base)
- && (gsi <= mp_gsi_routing[i].gsi_end))
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+ if ((gsi >= gsi_cfg->gsi_base)
+ && (gsi <= gsi_cfg->gsi_end))
return i;
}
@@ -3967,12 +3953,16 @@ int mp_find_ioapic(u32 gsi)
int mp_find_ioapic_pin(int ioapic, u32 gsi)
{
+ struct mp_ioapic_gsi *gsi_cfg;
+
if (WARN_ON(ioapic == -1))
return -1;
- if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
+
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if (WARN_ON(gsi > gsi_cfg->gsi_end))
return -1;
- return gsi - mp_gsi_routing[ioapic].gsi_base;
+ return gsi - gsi_cfg->gsi_base;
}
static __init int bad_ioapic(unsigned long address)
@@ -3994,40 +3984,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
{
int idx = 0;
int entries;
+ struct mp_ioapic_gsi *gsi_cfg;
if (bad_ioapic(address))
return;
idx = nr_ioapics;
- mp_ioapics[idx].type = MP_IOAPIC;
- mp_ioapics[idx].flags = MPC_APIC_USABLE;
- mp_ioapics[idx].apicaddr = address;
+ ioapics[idx].mp_config.type = MP_IOAPIC;
+ ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
+ ioapics[idx].mp_config.apicaddr = address;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].apicid = io_apic_unique_id(id);
- mp_ioapics[idx].apicver = io_apic_get_version(idx);
+ ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
+ ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
/*
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
entries = io_apic_get_redir_entries(idx);
- mp_gsi_routing[idx].gsi_base = gsi_base;
- mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
+ gsi_cfg = mp_ioapic_gsi_routing(idx);
+ gsi_cfg->gsi_base = gsi_base;
+ gsi_cfg->gsi_end = gsi_base + entries - 1;
/*
* The number of IO-APIC IRQ registers (== #pins):
*/
- nr_ioapic_registers[idx] = entries;
+ ioapics[idx].nr_registers = entries;
- if (mp_gsi_routing[idx].gsi_end >= gsi_top)
- gsi_top = mp_gsi_routing[idx].gsi_end + 1;
+ if (gsi_cfg->gsi_end >= gsi_top)
+ gsi_top = gsi_cfg->gsi_end + 1;
printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
- mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
- mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
+ "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
+ mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
nr_ioapics++;
}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 6273eee5134..c4a61ca1349 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -48,8 +48,6 @@
#include <asm/e820.h>
#include <asm/ipi.h>
-#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
-
int found_numaq;
/*
@@ -79,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4];
static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
{
struct eachquadmem *eq = scd->eq + node;
+ u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
+ u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
+ int ret;
- node_set_online(node);
-
- /* Convert to pages */
- node_start_pfn[node] =
- MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
-
- node_end_pfn[node] =
- MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
-
- memblock_x86_register_active_regions(node, node_start_pfn[node],
- node_end_pfn[node]);
-
- memory_present(node, node_start_pfn[node], node_end_pfn[node]);
-
- node_remap_size[node] = node_memmap_size_bytes(node,
- node_start_pfn[node],
- node_end_pfn[node]);
+ node_set(node, numa_nodes_parsed);
+ ret = numa_add_memblk(node, start, end);
+ BUG_ON(ret < 0);
}
/*
* Function: smp_dump_qct()
*
* Description: gets memory layout from the quad config table. This
- * function also updates node_online_map with the nodes (quads) present.
+ * function also updates numa_nodes_parsed with the nodes (quads) present.
*/
static void __init smp_dump_qct(void)
{
@@ -112,7 +99,6 @@ static void __init smp_dump_qct(void)
scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
- nodes_clear(node_online_map);
for_each_node(node) {
if (scd->quads_present31_0 & (1 << node))
numaq_register_node(node, scd);
@@ -282,14 +268,14 @@ static __init void early_check_numaq(void)
}
}
-int __init get_memcfg_numaq(void)
+int __init numaq_numa_init(void)
{
early_check_numaq();
if (!found_numaq)
- return 0;
+ return -ENOENT;
smp_dump_qct();
- return 1;
+ return 0;
}
#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
@@ -486,8 +472,8 @@ static void numaq_setup_portio_remap(void)
(u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
}
-/* Use __refdata to keep false positive warning calm. */
-struct apic __refdata apic_numaq = {
+/* Use __refdata to keep false positive warning calm. */
+static struct apic __refdata apic_numaq = {
.name = "NUMAQ",
.probe = probe_numaq,
@@ -551,3 +537,5 @@ struct apic __refdata apic_numaq = {
.x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
.x86_32_numa_cpu_node = numaq_numa_cpu_node,
};
+
+apic_driver(apic_numaq);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fc84c7b6110..b5254ad044a 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,31 +52,6 @@ static int __init print_ipi_mode(void)
}
late_initcall(print_ipi_mode);
-void __init default_setup_apic_routing(void)
-{
- int version = apic_version[boot_cpu_physical_apicid];
-
- if (num_possible_cpus() > 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(version)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above fall through */
- case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
- }
- }
-
-#ifdef CONFIG_X86_BIGSMP
- generic_bigsmp_probe();
-#endif
-
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
-}
-
static int default_x86_32_early_logical_apicid(int cpu)
{
return 1 << cpu;
@@ -112,7 +87,7 @@ static int probe_default(void)
return 1;
}
-struct apic apic_default = {
+static struct apic apic_default = {
.name = "default",
.probe = probe_default,
@@ -172,47 +147,24 @@ struct apic apic_default = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
- .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
};
-extern struct apic apic_numaq;
-extern struct apic apic_summit;
-extern struct apic apic_bigsmp;
-extern struct apic apic_es7000;
-extern struct apic apic_es7000_cluster;
+apic_driver(apic_default);
struct apic *apic = &apic_default;
EXPORT_SYMBOL_GPL(apic);
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_NUMAQ
- &apic_numaq,
-#endif
-#ifdef CONFIG_X86_SUMMIT
- &apic_summit,
-#endif
-#ifdef CONFIG_X86_BIGSMP
- &apic_bigsmp,
-#endif
-#ifdef CONFIG_X86_ES7000
- &apic_es7000,
- &apic_es7000_cluster,
-#endif
- &apic_default, /* must be last */
- NULL,
-};
-
static int cmdline_apic __initdata;
static int __init parse_apic(char *arg)
{
- int i;
+ struct apic **drv;
if (!arg)
return -EINVAL;
- for (i = 0; apic_probe[i]; i++) {
- if (!strcmp(apic_probe[i]->name, arg)) {
- apic = apic_probe[i];
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!strcmp((*drv)->name, arg)) {
+ apic = *drv;
cmdline_apic = 1;
return 0;
}
@@ -223,38 +175,58 @@ static int __init parse_apic(char *arg)
}
early_param("apic", parse_apic);
-void __init generic_bigsmp_probe(void)
+void __init default_setup_apic_routing(void)
{
+ int version = apic_version[boot_cpu_physical_apicid];
+
+ if (num_possible_cpus() > 8) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (!APIC_XAPIC(version)) {
+ def_to_bigsmp = 0;
+ break;
+ }
+ /* If P4 and above fall through */
+ case X86_VENDOR_AMD:
+ def_to_bigsmp = 1;
+ }
+ }
+
#ifdef CONFIG_X86_BIGSMP
/*
- * This routine is used to switch to bigsmp mode when
+ * This is used to switch to bigsmp mode when
* - There is no apic= option specified by the user
* - generic_apic_probe() has chosen apic_default as the sub_arch
* - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
*/
if (!cmdline_apic && apic == &apic_default) {
- if (apic_bigsmp.probe()) {
- apic = &apic_bigsmp;
+ struct apic *bigsmp = generic_bigsmp_probe();
+ if (bigsmp) {
+ apic = bigsmp;
printk(KERN_INFO "Overriding APIC driver with %s\n",
apic->name);
}
}
#endif
+
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
}
void __init generic_apic_probe(void)
{
if (!cmdline_apic) {
- int i;
- for (i = 0; apic_probe[i]; i++) {
- if (apic_probe[i]->probe()) {
- apic = apic_probe[i];
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe()) {
+ apic = *drv;
break;
}
}
/* Not visible without early console */
- if (!apic_probe[i])
+ if (drv == __apicdrivers_end)
panic("Didn't find an APIC driver");
}
printk(KERN_INFO "Using APIC driver %s\n", apic->name);
@@ -265,16 +237,16 @@ void __init generic_apic_probe(void)
int __init
generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
{
- int i;
+ struct apic **drv;
- for (i = 0; apic_probe[i]; ++i) {
- if (!apic_probe[i]->mps_oem_check)
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!((*drv)->mps_oem_check))
continue;
- if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
+ if (!(*drv)->mps_oem_check(mpc, oem, productid))
continue;
if (!cmdline_apic) {
- apic = apic_probe[i];
+ apic = *drv;
printk(KERN_INFO "Switched to APIC driver `%s'.\n",
apic->name);
}
@@ -285,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int i;
+ struct apic **drv;
- for (i = 0; apic_probe[i]; ++i) {
- if (!apic_probe[i]->acpi_madt_oem_check)
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!(*drv)->acpi_madt_oem_check)
continue;
- if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
+ if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
continue;
if (!cmdline_apic) {
- apic = apic_probe[i];
+ apic = *drv;
printk(KERN_INFO "Switched to APIC driver `%s'.\n",
apic->name);
}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index d8c4a6feb28..3fe98669892 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,27 +23,6 @@
#include <asm/ipi.h>
#include <asm/setup.h>
-extern struct apic apic_flat;
-extern struct apic apic_physflat;
-extern struct apic apic_x2xpic_uv_x;
-extern struct apic apic_x2apic_phys;
-extern struct apic apic_x2apic_cluster;
-
-struct apic __read_mostly *apic = &apic_flat;
-EXPORT_SYMBOL_GPL(apic);
-
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_UV
- &apic_x2apic_uv_x,
-#endif
-#ifdef CONFIG_X86_X2APIC
- &apic_x2apic_phys,
- &apic_x2apic_cluster,
-#endif
- &apic_physflat,
- NULL,
-};
-
static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
{
return hard_smp_processor_id() >> index_msb;
@@ -54,26 +33,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
*/
void __init default_setup_apic_routing(void)
{
+ struct apic **drv;
enable_IR_x2apic();
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_mode
-#ifdef CONFIG_X86_UV
- && apic != &apic_x2apic_uv_x
-#endif
- ) {
- if (x2apic_phys)
- apic = &apic_x2apic_phys;
- else
- apic = &apic_x2apic_cluster;
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe && (*drv)->probe()) {
+ if (apic != *drv) {
+ apic = *drv;
+ pr_info("Switched APIC routing to %s.\n",
+ apic->name);
+ }
+ break;
+ }
}
-#endif
-
- if (apic == &apic_flat && num_possible_cpus() > 8)
- apic = &apic_physflat;
-
- printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
if (is_vsmp_box()) {
/* need to update phys_pkg_id */
@@ -90,13 +63,15 @@ void apic_send_IPI_self(int vector)
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int i;
+ struct apic **drv;
- for (i = 0; apic_probe[i]; ++i) {
- if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
- apic = apic_probe[i];
- printk(KERN_INFO "Setting APIC routing to %s.\n",
- apic->name);
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
+ if (apic != *drv) {
+ apic = *drv;
+ pr_info("Setting APIC routing to %s.\n",
+ apic->name);
+ }
return 1;
}
}
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index e4b8059b414..19114423c58 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -491,7 +491,7 @@ void setup_summit(void)
}
#endif
-struct apic apic_summit = {
+static struct apic apic_summit = {
.name = "summit",
.probe = probe_summit,
@@ -551,5 +551,6 @@ struct apic apic_summit = {
.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
.x86_32_early_logical_apicid = summit_early_logical_apicid,
- .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
};
+
+apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 90949bbd566..50079587582 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,118 +5,95 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/dmar.h>
+#include <linux/cpu.h>
#include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include <asm/x2apic.h>
static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return x2apic_enabled();
}
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
+static inline u32 x2apic_cluster(int cpu)
{
- return cpu_online_mask;
-}
-
-/*
- * for now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
+ return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
}
static void
- __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
- unsigned long cfg;
+ struct cpumask *cpus_in_cluster_ptr;
+ struct cpumask *ipi_mask_ptr;
+ unsigned int cpu, this_cpu;
+ unsigned long flags;
+ u32 dest;
+
+ x2apic_wrmsr_fence();
+
+ local_irq_save(flags);
- cfg = __prepare_ICR(0, vector, dest);
+ this_cpu = smp_processor_id();
/*
- * send the IPI.
+ * We are to modify mask, so we need an own copy
+ * and be sure it's manipulated with irq off.
*/
- native_x2apic_icr_write(cfg, apicid);
-}
+ ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+ cpumask_copy(ipi_mask_ptr, mask);
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- unsigned long query_cpu;
- unsigned long flags;
+ /*
+ * The idea is to send one IPI per cluster.
+ */
+ for_each_cpu(cpu, ipi_mask_ptr) {
+ unsigned long i;
- x2apic_wrmsr_fence();
+ cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+ dest = 0;
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
+ /* Collect cpus in cluster. */
+ for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
+ if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
+ dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+ }
+
+ if (!dest)
+ continue;
+
+ __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+ /*
+ * Cluster sibling cpus should be discared now so
+ * we would not send IPI them second time.
+ */
+ cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
}
+
local_irq_restore(flags);
}
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
static void
x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
- continue;
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
- }
- local_irq_restore(flags);
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_allbutself(int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_online_cpu(query_cpu) {
- if (query_cpu == this_cpu)
- continue;
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
- }
- local_irq_restore(flags);
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_all(int vector)
{
- x2apic_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int x2apic_apic_id_registered(void)
-{
- return 1;
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}
static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return per_cpu(x86_cpu_to_logical_apicid, cpu);
}
-static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
+static void init_x2apic_ldr(void)
{
- unsigned int id;
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
- id = x;
- return id;
+ per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+
+ __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+ for_each_online_cpu(cpu) {
+ if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+ continue;
+ __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ }
}
-static unsigned long set_apic_id(unsigned int id)
+ /*
+ * At CPU state changes, update the x2apic cluster sibling info.
+ */
+static int __cpuinit
+update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
- unsigned long x;
+ unsigned int this_cpu = (unsigned long)hcpu;
+ unsigned int cpu;
+ int err = 0;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
+ GFP_KERNEL)) {
+ err = -ENOMEM;
+ } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
+ GFP_KERNEL)) {
+ free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+ err = -ENOMEM;
+ }
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ for_each_online_cpu(cpu) {
+ if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+ continue;
+ __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ }
+ free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+ free_cpumask_var(per_cpu(ipi_mask, this_cpu));
+ break;
+ }
- x = id;
- return x;
+ return notifier_from_errno(err);
}
-static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
-{
- return initial_apicid >> index_msb;
-}
+static struct notifier_block __refdata x2apic_cpu_notifier = {
+ .notifier_call = update_clusterinfo,
+};
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_init_cpu_notifier(void)
{
- apic_write(APIC_SELF_IPI, vector);
+ int cpu = smp_processor_id();
+
+ zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+
+ BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+
+ __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+ register_hotcpu_notifier(&x2apic_cpu_notifier);
+ return 1;
}
-static void init_x2apic_ldr(void)
+static int x2apic_cluster_probe(void)
{
- int cpu = smp_processor_id();
-
- per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+ if (x2apic_mode)
+ return x2apic_init_cpu_notifier();
+ else
+ return 0;
}
-struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster = {
.name = "cluster x2apic",
- .probe = NULL,
+ .probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
.apic_id_registered = x2apic_apic_id_registered,
@@ -211,11 +235,11 @@ struct apic apic_x2apic_cluster = {
.setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.enable_apic_mode = NULL,
- .phys_pkg_id = x2apic_cluster_phys_pkg_id,
+ .phys_pkg_id = x2apic_phys_pkg_id,
.mps_oem_check = NULL,
- .get_apic_id = x2apic_cluster_phys_get_apic_id,
- .set_apic_id = set_apic_id,
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
.apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -240,3 +264,5 @@ struct apic apic_x2apic_cluster = {
.wait_icr_idle = native_x2apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
+
+apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c7e6d6645bf..f5373dfde21 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,11 +7,12 @@
#include <linux/dmar.h>
#include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include <asm/x2apic.h>
int x2apic_phys;
+static struct apic apic_x2apic_phys;
+
static int set_x2apic_phys_mode(char *arg)
{
x2apic_phys = 1;
@@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
- unsigned int dest)
-{
- unsigned long cfg;
-
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * send the IPI.
- */
- native_x2apic_icr_write(cfg, apicid);
-}
-
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
unsigned long query_cpu;
+ unsigned long this_cpu;
unsigned long flags;
x2apic_wrmsr_fence();
local_irq_save(flags);
+
+ this_cpu = smp_processor_id();
for_each_cpu(query_cpu, mask) {
+ if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
+ continue;
__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
static void
x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu != this_cpu)
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
- }
- local_irq_restore(flags);
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_allbutself(int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_online_cpu(query_cpu) {
- if (query_cpu == this_cpu)
- continue;
- __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
- }
- local_irq_restore(flags);
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_all(int vector)
{
- x2apic_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int x2apic_apic_id_registered(void)
-{
- return 1;
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}
static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
return per_cpu(x86_cpu_to_apicid, cpu);
}
-static unsigned int x2apic_phys_get_apic_id(unsigned long x)
-{
- return x;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
- return id;
-}
-
-static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+static void init_x2apic_ldr(void)
{
- return initial_apicid >> index_msb;
}
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_phys_probe(void)
{
- apic_write(APIC_SELF_IPI, vector);
-}
+ if (x2apic_mode && x2apic_phys)
+ return 1;
-static void init_x2apic_ldr(void)
-{
+ return apic == &apic_x2apic_phys;
}
-struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys = {
.name = "physical x2apic",
- .probe = NULL,
+ .probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
.apic_id_registered = x2apic_apic_id_registered,
@@ -203,8 +144,8 @@ struct apic apic_x2apic_phys = {
.phys_pkg_id = x2apic_phys_pkg_id,
.mps_oem_check = NULL,
- .get_apic_id = x2apic_phys_get_apic_id,
- .set_apic_id = set_apic_id,
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
.apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -229,3 +170,5 @@ struct apic apic_x2apic_phys = {
.wait_icr_idle = native_x2apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
+
+apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 33b10a0fc09..adc66c3a1fe 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -37,6 +37,13 @@
#include <asm/smp.h>
#include <asm/x86_init.h>
#include <asm/emergency-restart.h>
+#include <asm/nmi.h>
+
+/* BMC sets a bit this MMR non-zero before sending an NMI */
+#define UVH_NMI_MMR UVH_SCRATCH5
+#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8)
+#define UV_NMI_PENDING_MASK (1UL << 63)
+DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
DEFINE_PER_CPU(int, x2apic_extra_bits);
@@ -51,6 +58,8 @@ unsigned int uv_apicid_hibits;
EXPORT_SYMBOL_GPL(uv_apicid_hibits);
static DEFINE_SPINLOCK(uv_nmi_lock);
+static struct apic apic_x2apic_uv_x;
+
static unsigned long __init uv_early_read_mmr(unsigned long addr)
{
unsigned long val, *mmr;
@@ -82,6 +91,10 @@ static int __init early_get_pnodeid(void)
m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
uv_min_hub_revision_id = node_id.s.revision;
+ if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
+ uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+
+ uv_hub_info->hub_revision = uv_min_hub_revision_id;
pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
return pnode;
}
@@ -103,17 +116,25 @@ static void __init early_get_apic_pnode_shift(void)
*/
static void __init uv_set_apicid_hibit(void)
{
- union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
+ union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
- apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
- uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
+ if (is_uv1_hub()) {
+ apicid_mask.v =
+ uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+ uv_apicid_hibits =
+ apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+ }
}
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int pnodeid;
+ int pnodeid, is_uv1, is_uv2;
- if (!strcmp(oem_id, "SGI")) {
+ is_uv1 = !strcmp(oem_id, "SGI");
+ is_uv2 = !strcmp(oem_id, "SGI2");
+ if (is_uv1 || is_uv2) {
+ uv_hub_info->hub_revision =
+ is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
pnodeid = early_get_pnodeid();
early_get_apic_pnode_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
@@ -319,10 +340,15 @@ static void uv_send_IPI_self(int vector)
apic_write(APIC_SELF_IPI, vector);
}
-struct apic __refdata apic_x2apic_uv_x = {
+static int uv_probe(void)
+{
+ return apic == &apic_x2apic_uv_x;
+}
+
+static struct apic __refdata apic_x2apic_uv_x = {
.name = "UV large system",
- .probe = NULL,
+ .probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
.apic_id_registered = uv_apic_id_registered,
@@ -470,12 +496,19 @@ static __init void map_mmr_high(int max_pnode)
static __init void map_mmioh_high(int max_pnode)
{
union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
- int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ int shift;
mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- if (mmioh.s.enable)
- map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
+ if (is_uv1_hub() && mmioh.s1.enable) {
+ shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
+ max_pnode, map_uc);
+ }
+ if (is_uv2_hub() && mmioh.s2.enable) {
+ shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
max_pnode, map_uc);
+ }
}
static __init void map_low_mmrs(void)
@@ -599,14 +632,14 @@ late_initcall(uv_init_heartbeat);
/* Direct Legacy VGA I/O traffic to designated IOH */
int uv_set_vga_state(struct pci_dev *pdev, bool decode,
- unsigned int command_bits, bool change_bridge)
+ unsigned int command_bits, u32 flags)
{
int domain, bus, rc;
- PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
- pdev->devfn, decode, command_bits, change_bridge);
+ PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
+ pdev->devfn, decode, command_bits, flags);
- if (!change_bridge)
+ if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
return 0;
if ((command_bits & PCI_COMMAND_IO) == 0)
@@ -642,18 +675,46 @@ void __cpuinit uv_cpu_init(void)
*/
int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
{
+ unsigned long real_uv_nmi;
+ int bid;
+
if (reason != DIE_NMIUNKNOWN)
return NOTIFY_OK;
if (in_crash_kexec)
/* do nothing if entering the crash kernel */
return NOTIFY_OK;
+
/*
- * Use a lock so only one cpu prints at a time
- * to prevent intermixed output.
+ * Each blade has an MMR that indicates when an NMI has been sent
+ * to cpus on the blade. If an NMI is detected, atomically
+ * clear the MMR and update a per-blade NMI count used to
+ * cause each cpu on the blade to notice a new NMI.
+ */
+ bid = uv_numa_blade_id();
+ real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+
+ if (unlikely(real_uv_nmi)) {
+ spin_lock(&uv_blade_info[bid].nmi_lock);
+ real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+ if (real_uv_nmi) {
+ uv_blade_info[bid].nmi_count++;
+ uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
+ }
+ spin_unlock(&uv_blade_info[bid].nmi_lock);
+ }
+
+ if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
+ return NOTIFY_DONE;
+
+ __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
+
+ /*
+ * Use a lock so only one cpu prints at a time.
+ * This prevents intermixed output.
*/
spin_lock(&uv_nmi_lock);
- pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
+ pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
dump_stack();
spin_unlock(&uv_nmi_lock);
@@ -661,7 +722,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
}
static struct notifier_block uv_dump_stack_nmi_nb = {
- .notifier_call = uv_handle_nmi
+ .notifier_call = uv_handle_nmi,
+ .priority = NMI_LOCAL_LOW_PRIOR - 1,
};
void uv_register_nmi_notifier(void)
@@ -693,13 +755,14 @@ void __init uv_system_init(void)
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask, pnode_io_mask;
+ printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
map_low_mmrs();
m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- n_io = mmioh.s.n_io;
+ n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
@@ -720,8 +783,9 @@ void __init uv_system_init(void)
printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
- uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+ uv_blade_info = kzalloc(bytes, GFP_KERNEL);
BUG_ON(!uv_blade_info);
+
for (blade = 0; blade < uv_num_possible_blades(); blade++)
uv_blade_info[blade].memory_nid = -1;
@@ -747,6 +811,7 @@ void __init uv_system_init(void)
uv_blade_info[blade].pnode = pnode;
uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
+ spin_lock_init(&uv_blade_info[blade].nmi_lock);
max_pnode = max(pnode, max_pnode);
blade++;
}
@@ -766,6 +831,8 @@ void __init uv_system_init(void)
*/
uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
+ uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
+
pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
lcpu = uv_blade_info[blade].nr_possible_cpus;
@@ -821,3 +888,5 @@ void __init uv_system_init(void)
if (is_kdump_kernel())
reboot_type = BOOT_ACPI;
}
+
+apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0b4be431c62..0371c484bb8 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,11 +228,12 @@
#include <linux/kthread.h>
#include <linux/jiffies.h>
#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <linux/i8253.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
-#include <asm/i8253.h>
#include <asm/olpc.h>
#include <asm/paravirt.h>
#include <asm/reboot.h>
@@ -360,6 +361,7 @@ struct apm_user {
* idle percentage above which bios idle calls are done
*/
#ifdef CONFIG_APM_CPU_IDLE
+#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
#define DEFAULT_IDLE_THRESHOLD 95
#else
#define DEFAULT_IDLE_THRESHOLD 100
@@ -903,6 +905,7 @@ static void apm_cpu_idle(void)
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
unsigned int bucket;
+ WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
recalc:
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
use_apm_idle = 0;
@@ -1217,11 +1220,11 @@ static void reinit_timer(void)
raw_spin_lock_irqsave(&i8253_lock, flags);
/* set the clock to HZ */
- outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
- outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
udelay(10);
- outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
+ outb_p(LATCH >> 8, PIT_CH0); /* MSB */
udelay(10);
raw_spin_unlock_irqrestore(&i8253_lock, flags);
#endif
@@ -1237,7 +1240,7 @@ static int suspend(int vetoable)
dpm_suspend_noirq(PMSG_SUSPEND);
local_irq_disable();
- sysdev_suspend(PMSG_SUSPEND);
+ syscore_suspend();
local_irq_enable();
@@ -1255,7 +1258,7 @@ static int suspend(int vetoable)
apm_error("suspend", err);
err = (err == APM_SUCCESS) ? 0 : -EIO;
- sysdev_resume();
+ syscore_resume();
local_irq_enable();
dpm_resume_noirq(PMSG_RESUME);
@@ -1279,7 +1282,7 @@ static void standby(void)
dpm_suspend_noirq(PMSG_SUSPEND);
local_irq_disable();
- sysdev_suspend(PMSG_SUSPEND);
+ syscore_suspend();
local_irq_enable();
err = set_system_power_state(APM_STATE_STANDBY);
@@ -1287,7 +1290,7 @@ static void standby(void)
apm_error("standby", err);
local_irq_disable();
- sysdev_resume();
+ syscore_resume();
local_irq_enable();
dpm_resume_noirq(PMSG_RESUME);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6f..395a10e6806 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
- OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
BLANK();
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe429a0..6042981d030 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3532d3bf810..b13ed393dfc 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -612,8 +612,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
#endif
- /* As a rule processors have APIC timer running in deep C states */
- if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
+ /*
+ * Family 0x12 and above processors have APIC timer
+ * running in deep C states.
+ */
+ if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
/*
@@ -629,10 +632,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
u64 mask;
+ int err;
- rdmsrl(MSR_AMD64_MCx_MASK(4), mask);
- mask |= (1 << 10);
- wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+ err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
+ if (err == 0) {
+ mask |= (1 << 10);
+ checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+ }
}
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c39576cb301..46674fbb62b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -19,6 +19,7 @@
static int __init no_halt(char *s)
{
+ WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
boot_cpu_data.hlt_works_ok = 0;
return 1;
}
@@ -61,6 +62,8 @@ static void __init check_fpu(void)
return;
}
+ kernel_fpu_begin();
+
/*
* trap_init() enabled FXSR and company _before_ testing for FP
* problems here.
@@ -79,6 +82,8 @@ static void __init check_fpu(void)
: "=m" (*&fdiv_bug)
: "m" (*&x), "m" (*&y));
+ kernel_fpu_end();
+
boot_cpu_data.fdiv_bug = fdiv_bug;
if (boot_cpu_data.fdiv_bug)
printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2ced0074a4..62184390a60 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,7 +21,7 @@
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <asm/pgtable.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/proto.h>
#include <asm/setup.h>
#include <asm/apic.h>
@@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
}
#endif
+static int disable_smep __cpuinitdata;
+static __init int setup_disable_smep(char *arg)
+{
+ disable_smep = 1;
+ return 1;
+}
+__setup("nosmep", setup_disable_smep);
+
+static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_SMEP)) {
+ if (unlikely(disable_smep)) {
+ setup_clear_cpu_cap(X86_FEATURE_SMEP);
+ clear_in_cr4(X86_CR4_SMEP);
+ } else
+ set_in_cr4(X86_CR4_SMEP);
+ }
+}
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -458,13 +477,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
if (smp_num_siblings <= 1)
goto out;
- if (smp_num_siblings > nr_cpu_ids) {
- pr_warning("CPU: Unsupported number of siblings %d",
- smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
-
index_msb = get_count_order(smp_num_siblings);
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
@@ -565,8 +577,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
- if (eax > 0)
- c->x86_capability[9] = ebx;
+ c->x86_capability[9] = ebx;
}
/* AMD-defined flags: level 0x80000001 */
@@ -668,6 +679,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
c->cpu_index = 0;
#endif
filter_cpuid_features(c, false);
+
+ setup_smep(c);
}
void __init early_cpu_init(void)
@@ -753,6 +766,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
#endif
}
+ setup_smep(c);
+
get_model_name(c); /* Default name */
detect_nopl(c);
@@ -887,7 +902,7 @@ static void vgetcpu_set_mode(void)
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
- init_c1e_mask();
+ init_amd_e400_c1e_mask();
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index 870e6cc6ad2..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,266 +0,0 @@
-#
-# CPU Frequency scaling
-#
-
-menu "CPU Frequency scaling"
-
-source "drivers/cpufreq/Kconfig"
-
-if CPU_FREQ
-
-comment "CPUFreq processor drivers"
-
-config X86_PCC_CPUFREQ
- tristate "Processor Clocking Control interface driver"
- depends on ACPI && ACPI_PROCESSOR
- help
- This driver adds support for the PCC interface.
-
- For details, take a look at:
- <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
-
- To compile this driver as a module, choose M here: the
- module will be called pcc-cpufreq.
-
- If in doubt, say N.
-
-config X86_ACPI_CPUFREQ
- tristate "ACPI Processor P-States driver"
- select CPU_FREQ_TABLE
- depends on ACPI_PROCESSOR
- help
- This driver adds a CPUFreq driver which utilizes the ACPI
- Processor Performance States.
- This driver also supports Intel Enhanced Speedstep.
-
- To compile this driver as a module, choose M here: the
- module will be called acpi-cpufreq.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config ELAN_CPUFREQ
- tristate "AMD Elan SC400 and SC410"
- select CPU_FREQ_TABLE
- depends on X86_ELAN
- ---help---
- This adds the CPUFreq driver for AMD Elan SC400 and SC410
- processors.
-
- You need to specify the processor maximum speed as boot
- parameter: elanfreq=maxspeed (in kHz) or as module
- parameter "max_freq".
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config SC520_CPUFREQ
- tristate "AMD Elan SC520"
- select CPU_FREQ_TABLE
- depends on X86_ELAN
- ---help---
- This adds the CPUFreq driver for AMD Elan SC520 processor.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-
-config X86_POWERNOW_K6
- tristate "AMD Mobile K6-2/K6-3 PowerNow!"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
- AMD K6-3+ processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_POWERNOW_K7
- tristate "AMD Mobile Athlon/Duron PowerNow!"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for mobile AMD K7 mobile processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_POWERNOW_K7_ACPI
- bool
- depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
- depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
- depends on X86_32
- default y
-
-config X86_POWERNOW_K8
- tristate "AMD Opteron/Athlon64 PowerNow!"
- select CPU_FREQ_TABLE
- depends on ACPI && ACPI_PROCESSOR
- help
- This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
-
- To compile this driver as a module, choose M here: the
- module will be called powernow-k8.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
-config X86_GX_SUSPMOD
- tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
- depends on X86_32 && PCI
- help
- This add the CPUFreq driver for NatSemi Geode processors which
- support suspend modulation.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO
- tristate "Intel Enhanced SpeedStep (deprecated)"
- select CPU_FREQ_TABLE
- select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
- depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
- help
- This is deprecated and this functionality is now merged into
- acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
- speedstep_centrino.
- This adds the CPUFreq driver for Enhanced SpeedStep enabled
- mobile CPUs. This means Intel Pentium M (Centrino) CPUs
- or 64bit enabled Intel Xeons.
-
- To compile this driver as a module, choose M here: the
- module will be called speedstep-centrino.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO_TABLE
- bool "Built-in tables for Banias CPUs"
- depends on X86_32 && X86_SPEEDSTEP_CENTRINO
- default y
- help
- Use built-in tables for Banias CPUs if ACPI encoding
- is not available.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_ICH
- tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for certain mobile Intel Pentium III
- (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
- mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
- ICH3 or ICH4 southbridge.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_SMI
- tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
- select CPU_FREQ_TABLE
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for certain mobile Intel Pentium III
- (Coppermine), all mobile Intel Pentium III-M (Tualatin)
- on systems which have an Intel 440BX/ZX/MX southbridge.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_P4_CLOCKMOD
- tristate "Intel Pentium 4 clock modulation"
- select CPU_FREQ_TABLE
- help
- This adds the CPUFreq driver for Intel Pentium 4 / XEON
- processors. When enabled it will lower CPU temperature by skipping
- clocks.
-
- This driver should be only used in exceptional
- circumstances when very low power is needed because it causes severe
- slowdowns and noticeable latencies. Normally Speedstep should be used
- instead.
-
- To compile this driver as a module, choose M here: the
- module will be called p4-clockmod.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- Unless you are absolutely sure say N.
-
-config X86_CPUFREQ_NFORCE2
- tristate "nVidia nForce2 FSB changing"
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for FSB changing on nVidia nForce2
- platforms.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_LONGRUN
- tristate "Transmeta LongRun"
- depends on X86_32
- help
- This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
- which support LongRun.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_LONGHAUL
- tristate "VIA Cyrix III Longhaul"
- select CPU_FREQ_TABLE
- depends on X86_32 && ACPI_PROCESSOR
- help
- This adds the CPUFreq driver for VIA Samuel/CyrixIII,
- VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
- processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_E_POWERSAVER
- tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
- select CPU_FREQ_TABLE
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for VIA C7 processors. However, this driver
- does not have any safeguards to prevent operating the CPU out of spec
- and is thus considered dangerous. Please use the regular ACPI cpufreq
- driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
-
- If in doubt, say N.
-
-comment "shared options"
-
-config X86_SPEEDSTEP_LIB
- tristate
- default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
-
-config X86_SPEEDSTEP_RELAXED_CAP_CHECK
- bool "Relaxed speedstep capability checks"
- depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
- help
- Don't perform all checks for a speedstep capable system which would
- normally be done. Some ancient or strange systems, though speedstep
- capable, don't always indicate that they are speedstep capable. This
- option lets the probing code bypass some of those checks if the
- parameter "relaxed_check=1" is passed to the module.
-
-endif # CPU_FREQ
-
-endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index bd54bf67e6f..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
-# K8 systems. ACPI is preferred to all other hardware-specific drivers.
-# speedstep-* is preferred over p4-clockmod.
-
-obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
-obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
-obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
-obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
-obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
-obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
-obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
-obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
-obj-$(CONFIG_X86_LONGRUN) += longrun.o
-obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
-obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
-obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
-obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
-obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
-obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
-obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index a2baafb2fe6..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,776 +0,0 @@
-/*
- * acpi-cpufreq.c - ACPI Processor P-States Driver
- *
- * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
- * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
- * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/cpufreq.h>
-#include <linux/compiler.h>
-#include <linux/dmi.h>
-#include <linux/slab.h>
-
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/uaccess.h>
-
-#include <acpi/processor.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-#include "mperf.h"
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "acpi-cpufreq", msg)
-
-MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
-MODULE_DESCRIPTION("ACPI Processor P-States Driver");
-MODULE_LICENSE("GPL");
-
-enum {
- UNDEFINED_CAPABLE = 0,
- SYSTEM_INTEL_MSR_CAPABLE,
- SYSTEM_IO_CAPABLE,
-};
-
-#define INTEL_MSR_RANGE (0xffff)
-
-struct acpi_cpufreq_data {
- struct acpi_processor_performance *acpi_data;
- struct cpufreq_frequency_table *freq_table;
- unsigned int resume;
- unsigned int cpu_feature;
-};
-
-static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
-
-/* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance __percpu *acpi_perf_data;
-
-static struct cpufreq_driver acpi_cpufreq_driver;
-
-static unsigned int acpi_pstate_strict;
-
-static int check_est_cpu(unsigned int cpuid)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
-
- return cpu_has(cpu, X86_FEATURE_EST);
-}
-
-static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
-{
- struct acpi_processor_performance *perf;
- int i;
-
- perf = data->acpi_data;
-
- for (i = 0; i < perf->state_count; i++) {
- if (value == perf->states[i].status)
- return data->freq_table[i].frequency;
- }
- return 0;
-}
-
-static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
-{
- int i;
- struct acpi_processor_performance *perf;
-
- msr &= INTEL_MSR_RANGE;
- perf = data->acpi_data;
-
- for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
- if (msr == perf->states[data->freq_table[i].index].status)
- return data->freq_table[i].frequency;
- }
- return data->freq_table[0].frequency;
-}
-
-static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
-{
- switch (data->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- return extract_msr(val, data);
- case SYSTEM_IO_CAPABLE:
- return extract_io(val, data);
- default:
- return 0;
- }
-}
-
-struct msr_addr {
- u32 reg;
-};
-
-struct io_addr {
- u16 port;
- u8 bit_width;
-};
-
-struct drv_cmd {
- unsigned int type;
- const struct cpumask *mask;
- union {
- struct msr_addr msr;
- struct io_addr io;
- } addr;
- u32 val;
-};
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void do_drv_read(void *_cmd)
-{
- struct drv_cmd *cmd = _cmd;
- u32 h;
-
- switch (cmd->type) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- rdmsr(cmd->addr.msr.reg, cmd->val, h);
- break;
- case SYSTEM_IO_CAPABLE:
- acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
- &cmd->val,
- (u32)cmd->addr.io.bit_width);
- break;
- default:
- break;
- }
-}
-
-/* Called via smp_call_function_many(), on the target CPUs */
-static void do_drv_write(void *_cmd)
-{
- struct drv_cmd *cmd = _cmd;
- u32 lo, hi;
-
- switch (cmd->type) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- rdmsr(cmd->addr.msr.reg, lo, hi);
- lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
- wrmsr(cmd->addr.msr.reg, lo, hi);
- break;
- case SYSTEM_IO_CAPABLE:
- acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
- cmd->val,
- (u32)cmd->addr.io.bit_width);
- break;
- default:
- break;
- }
-}
-
-static void drv_read(struct drv_cmd *cmd)
-{
- int err;
- cmd->val = 0;
-
- err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
- WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
-}
-
-static void drv_write(struct drv_cmd *cmd)
-{
- int this_cpu;
-
- this_cpu = get_cpu();
- if (cpumask_test_cpu(this_cpu, cmd->mask))
- do_drv_write(cmd);
- smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
- put_cpu();
-}
-
-static u32 get_cur_val(const struct cpumask *mask)
-{
- struct acpi_processor_performance *perf;
- struct drv_cmd cmd;
-
- if (unlikely(cpumask_empty(mask)))
- return 0;
-
- switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
- cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
- break;
- case SYSTEM_IO_CAPABLE:
- cmd.type = SYSTEM_IO_CAPABLE;
- perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
- cmd.addr.io.port = perf->control_register.address;
- cmd.addr.io.bit_width = perf->control_register.bit_width;
- break;
- default:
- return 0;
- }
-
- cmd.mask = mask;
- drv_read(&cmd);
-
- dprintk("get_cur_val = %u\n", cmd.val);
-
- return cmd.val;
-}
-
-static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
- unsigned int freq;
- unsigned int cached_freq;
-
- dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
-
- if (unlikely(data == NULL ||
- data->acpi_data == NULL || data->freq_table == NULL)) {
- return 0;
- }
-
- cached_freq = data->freq_table[data->acpi_data->state].frequency;
- freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
- if (freq != cached_freq) {
- /*
- * The dreaded BIOS frequency change behind our back.
- * Force set the frequency on next target call.
- */
- data->resume = 1;
- }
-
- dprintk("cur freq = %u\n", freq);
-
- return freq;
-}
-
-static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
- struct acpi_cpufreq_data *data)
-{
- unsigned int cur_freq;
- unsigned int i;
-
- for (i = 0; i < 100; i++) {
- cur_freq = extract_freq(get_cur_val(mask), data);
- if (cur_freq == freq)
- return 1;
- udelay(10);
- }
- return 0;
-}
-
-static int acpi_cpufreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
- struct acpi_processor_performance *perf;
- struct cpufreq_freqs freqs;
- struct drv_cmd cmd;
- unsigned int next_state = 0; /* Index into freq_table */
- unsigned int next_perf_state = 0; /* Index into perf table */
- unsigned int i;
- int result = 0;
-
- dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
-
- if (unlikely(data == NULL ||
- data->acpi_data == NULL || data->freq_table == NULL)) {
- return -ENODEV;
- }
-
- perf = data->acpi_data;
- result = cpufreq_frequency_table_target(policy,
- data->freq_table,
- target_freq,
- relation, &next_state);
- if (unlikely(result)) {
- result = -ENODEV;
- goto out;
- }
-
- next_perf_state = data->freq_table[next_state].index;
- if (perf->state == next_perf_state) {
- if (unlikely(data->resume)) {
- dprintk("Called after resume, resetting to P%d\n",
- next_perf_state);
- data->resume = 0;
- } else {
- dprintk("Already at target state (P%d)\n",
- next_perf_state);
- goto out;
- }
- }
-
- switch (data->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
- cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
- cmd.val = (u32) perf->states[next_perf_state].control;
- break;
- case SYSTEM_IO_CAPABLE:
- cmd.type = SYSTEM_IO_CAPABLE;
- cmd.addr.io.port = perf->control_register.address;
- cmd.addr.io.bit_width = perf->control_register.bit_width;
- cmd.val = (u32) perf->states[next_perf_state].control;
- break;
- default:
- result = -ENODEV;
- goto out;
- }
-
- /* cpufreq holds the hotplug lock, so we are safe from here on */
- if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
- cmd.mask = policy->cpus;
- else
- cmd.mask = cpumask_of(policy->cpu);
-
- freqs.old = perf->states[perf->state].core_frequency * 1000;
- freqs.new = data->freq_table[next_state].frequency;
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- drv_write(&cmd);
-
- if (acpi_pstate_strict) {
- if (!check_freqs(cmd.mask, freqs.new, data)) {
- dprintk("acpi_cpufreq_target failed (%d)\n",
- policy->cpu);
- result = -EAGAIN;
- goto out;
- }
- }
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- perf->state = next_perf_state;
-
-out:
- return result;
-}
-
-static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
- dprintk("acpi_cpufreq_verify\n");
-
- return cpufreq_frequency_table_verify(policy, data->freq_table);
-}
-
-static unsigned long
-acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
-{
- struct acpi_processor_performance *perf = data->acpi_data;
-
- if (cpu_khz) {
- /* search the closest match to cpu_khz */
- unsigned int i;
- unsigned long freq;
- unsigned long freqn = perf->states[0].core_frequency * 1000;
-
- for (i = 0; i < (perf->state_count-1); i++) {
- freq = freqn;
- freqn = perf->states[i+1].core_frequency * 1000;
- if ((2 * cpu_khz) > (freqn + freq)) {
- perf->state = i;
- return freq;
- }
- }
- perf->state = perf->state_count-1;
- return freqn;
- } else {
- /* assume CPU is at P0... */
- perf->state = 0;
- return perf->states[0].core_frequency * 1000;
- }
-}
-
-static void free_acpi_perf_data(void)
-{
- unsigned int i;
-
- /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
- for_each_possible_cpu(i)
- free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
- ->shared_cpu_map);
- free_percpu(acpi_perf_data);
-}
-
-/*
- * acpi_cpufreq_early_init - initialize ACPI P-States library
- *
- * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
- * in order to determine correct frequency and voltage pairings. We can
- * do _PDC and _PSD and find out the processor dependency for the
- * actual init that will happen later...
- */
-static int __init acpi_cpufreq_early_init(void)
-{
- unsigned int i;
- dprintk("acpi_cpufreq_early_init\n");
-
- acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
- if (!acpi_perf_data) {
- dprintk("Memory allocation error for acpi_perf_data.\n");
- return -ENOMEM;
- }
- for_each_possible_cpu(i) {
- if (!zalloc_cpumask_var_node(
- &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
- GFP_KERNEL, cpu_to_node(i))) {
-
- /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
- free_acpi_perf_data();
- return -ENOMEM;
- }
- }
-
- /* Do initialization in ACPI core */
- acpi_processor_preregister_performance(acpi_perf_data);
- return 0;
-}
-
-#ifdef CONFIG_SMP
-/*
- * Some BIOSes do SW_ANY coordination internally, either set it up in hw
- * or do it in BIOS firmware and won't inform about it to OS. If not
- * detected, this has a side effect of making CPU run at a different speed
- * than OS intended it to run at. Detect it and handle it cleanly.
- */
-static int bios_with_sw_any_bug;
-
-static int sw_any_bug_found(const struct dmi_system_id *d)
-{
- bios_with_sw_any_bug = 1;
- return 0;
-}
-
-static const struct dmi_system_id sw_any_bug_dmi_table[] = {
- {
- .callback = sw_any_bug_found,
- .ident = "Supermicro Server X6DLP",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
- DMI_MATCH(DMI_BIOS_VERSION, "080010"),
- DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
- },
- },
- { }
-};
-
-static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
-{
- /* Intel Xeon Processor 7100 Series Specification Update
- * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
- * AL30: A Machine Check Exception (MCE) Occurring during an
- * Enhanced Intel SpeedStep Technology Ratio Change May Cause
- * Both Processor Cores to Lock Up. */
- if (c->x86_vendor == X86_VENDOR_INTEL) {
- if ((c->x86 == 15) &&
- (c->x86_model == 6) &&
- (c->x86_mask == 8)) {
- printk(KERN_INFO "acpi-cpufreq: Intel(R) "
- "Xeon(R) 7100 Errata AL30, processors may "
- "lock up on frequency changes: disabling "
- "acpi-cpufreq.\n");
- return -ENODEV;
- }
- }
- return 0;
-}
-#endif
-
-static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i;
- unsigned int valid_states = 0;
- unsigned int cpu = policy->cpu;
- struct acpi_cpufreq_data *data;
- unsigned int result = 0;
- struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
- struct acpi_processor_performance *perf;
-#ifdef CONFIG_SMP
- static int blacklisted;
-#endif
-
- dprintk("acpi_cpufreq_cpu_init\n");
-
-#ifdef CONFIG_SMP
- if (blacklisted)
- return blacklisted;
- blacklisted = acpi_cpufreq_blacklist(c);
- if (blacklisted)
- return blacklisted;
-#endif
-
- data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
- per_cpu(acfreq_data, cpu) = data;
-
- if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
- acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- result = acpi_processor_register_performance(data->acpi_data, cpu);
- if (result)
- goto err_free;
-
- perf = data->acpi_data;
- policy->shared_type = perf->shared_type;
-
- /*
- * Will let policy->cpus know about dependency only when software
- * coordination is required.
- */
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
- policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
- cpumask_copy(policy->cpus, perf->shared_cpu_map);
- }
- cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
-
-#ifdef CONFIG_SMP
- dmi_check_system(sw_any_bug_dmi_table);
- if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
- policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
- cpumask_copy(policy->cpus, cpu_core_mask(cpu));
- }
-#endif
-
- /* capability check */
- if (perf->state_count <= 1) {
- dprintk("No P-States\n");
- result = -ENODEV;
- goto err_unreg;
- }
-
- if (perf->control_register.space_id != perf->status_register.space_id) {
- result = -ENODEV;
- goto err_unreg;
- }
-
- switch (perf->control_register.space_id) {
- case ACPI_ADR_SPACE_SYSTEM_IO:
- dprintk("SYSTEM IO addr space\n");
- data->cpu_feature = SYSTEM_IO_CAPABLE;
- break;
- case ACPI_ADR_SPACE_FIXED_HARDWARE:
- dprintk("HARDWARE addr space\n");
- if (!check_est_cpu(cpu)) {
- result = -ENODEV;
- goto err_unreg;
- }
- data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
- break;
- default:
- dprintk("Unknown addr space %d\n",
- (u32) (perf->control_register.space_id));
- result = -ENODEV;
- goto err_unreg;
- }
-
- data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
- (perf->state_count+1), GFP_KERNEL);
- if (!data->freq_table) {
- result = -ENOMEM;
- goto err_unreg;
- }
-
- /* detect transition latency */
- policy->cpuinfo.transition_latency = 0;
- for (i = 0; i < perf->state_count; i++) {
- if ((perf->states[i].transition_latency * 1000) >
- policy->cpuinfo.transition_latency)
- policy->cpuinfo.transition_latency =
- perf->states[i].transition_latency * 1000;
- }
-
- /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
- if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
- policy->cpuinfo.transition_latency > 20 * 1000) {
- policy->cpuinfo.transition_latency = 20 * 1000;
- printk_once(KERN_INFO
- "P-state transition latency capped at 20 uS\n");
- }
-
- /* table init */
- for (i = 0; i < perf->state_count; i++) {
- if (i > 0 && perf->states[i].core_frequency >=
- data->freq_table[valid_states-1].frequency / 1000)
- continue;
-
- data->freq_table[valid_states].index = i;
- data->freq_table[valid_states].frequency =
- perf->states[i].core_frequency * 1000;
- valid_states++;
- }
- data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
- perf->state = 0;
-
- result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
- if (result)
- goto err_freqfree;
-
- if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
- printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
-
- switch (perf->control_register.space_id) {
- case ACPI_ADR_SPACE_SYSTEM_IO:
- /* Current speed is unknown and not detectable by IO port */
- policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
- break;
- case ACPI_ADR_SPACE_FIXED_HARDWARE:
- acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
- policy->cur = get_cur_freq_on_cpu(cpu);
- break;
- default:
- break;
- }
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- /* Check for APERF/MPERF support in hardware */
- if (cpu_has(c, X86_FEATURE_APERFMPERF))
- acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
-
- dprintk("CPU%u - ACPI performance management activated.\n", cpu);
- for (i = 0; i < perf->state_count; i++)
- dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
- (i == perf->state ? '*' : ' '), i,
- (u32) perf->states[i].core_frequency,
- (u32) perf->states[i].power,
- (u32) perf->states[i].transition_latency);
-
- cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
-
- /*
- * the first call to ->target() should result in us actually
- * writing something to the appropriate registers.
- */
- data->resume = 1;
-
- return result;
-
-err_freqfree:
- kfree(data->freq_table);
-err_unreg:
- acpi_processor_unregister_performance(perf, cpu);
-err_free:
- kfree(data);
- per_cpu(acfreq_data, cpu) = NULL;
-
- return result;
-}
-
-static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
- dprintk("acpi_cpufreq_cpu_exit\n");
-
- if (data) {
- cpufreq_frequency_table_put_attr(policy->cpu);
- per_cpu(acfreq_data, policy->cpu) = NULL;
- acpi_processor_unregister_performance(data->acpi_data,
- policy->cpu);
- kfree(data->freq_table);
- kfree(data);
- }
-
- return 0;
-}
-
-static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
- dprintk("acpi_cpufreq_resume\n");
-
- data->resume = 1;
-
- return 0;
-}
-
-static struct freq_attr *acpi_cpufreq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver acpi_cpufreq_driver = {
- .verify = acpi_cpufreq_verify,
- .target = acpi_cpufreq_target,
- .bios_limit = acpi_processor_get_bios_limit,
- .init = acpi_cpufreq_cpu_init,
- .exit = acpi_cpufreq_cpu_exit,
- .resume = acpi_cpufreq_resume,
- .name = "acpi-cpufreq",
- .owner = THIS_MODULE,
- .attr = acpi_cpufreq_attr,
-};
-
-static int __init acpi_cpufreq_init(void)
-{
- int ret;
-
- if (acpi_disabled)
- return 0;
-
- dprintk("acpi_cpufreq_init\n");
-
- ret = acpi_cpufreq_early_init();
- if (ret)
- return ret;
-
- ret = cpufreq_register_driver(&acpi_cpufreq_driver);
- if (ret)
- free_acpi_perf_data();
-
- return ret;
-}
-
-static void __exit acpi_cpufreq_exit(void)
-{
- dprintk("acpi_cpufreq_exit\n");
-
- cpufreq_unregister_driver(&acpi_cpufreq_driver);
-
- free_percpu(acpi_perf_data);
-}
-
-module_param(acpi_pstate_strict, uint, 0644);
-MODULE_PARM_DESC(acpi_pstate_strict,
- "value 0 or non-zero. non-zero -> strict ACPI checks are "
- "performed during frequency changes.");
-
-late_initcall(acpi_cpufreq_init);
-module_exit(acpi_cpufreq_exit);
-
-MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 141abebc451..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
- * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon reverse engineered information
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-
-#define NFORCE2_XTAL 25
-#define NFORCE2_BOOTFSB 0x48
-#define NFORCE2_PLLENABLE 0xa8
-#define NFORCE2_PLLREG 0xa4
-#define NFORCE2_PLLADR 0xa0
-#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
-
-#define NFORCE2_MIN_FSB 50
-#define NFORCE2_SAFE_DISTANCE 50
-
-/* Delay in ms between FSB changes */
-/* #define NFORCE2_DELAY 10 */
-
-/*
- * nforce2_chipset:
- * FSB is changed using the chipset
- */
-static struct pci_dev *nforce2_dev;
-
-/* fid:
- * multiplier * 10
- */
-static int fid;
-
-/* min_fsb, max_fsb:
- * minimum and maximum FSB (= FSB at boot time)
- */
-static int min_fsb;
-static int max_fsb;
-
-MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
-MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
-MODULE_LICENSE("GPL");
-
-module_param(fid, int, 0444);
-module_param(min_fsb, int, 0444);
-
-MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
-MODULE_PARM_DESC(min_fsb,
- "Minimum FSB to use, if not defined: current FSB - 50");
-
-#define PFX "cpufreq-nforce2: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "cpufreq-nforce2", msg)
-
-/**
- * nforce2_calc_fsb - calculate FSB
- * @pll: PLL value
- *
- * Calculates FSB from PLL value
- */
-static int nforce2_calc_fsb(int pll)
-{
- unsigned char mul, div;
-
- mul = (pll >> 8) & 0xff;
- div = pll & 0xff;
-
- if (div > 0)
- return NFORCE2_XTAL * mul / div;
-
- return 0;
-}
-
-/**
- * nforce2_calc_pll - calculate PLL value
- * @fsb: FSB
- *
- * Calculate PLL value for given FSB
- */
-static int nforce2_calc_pll(unsigned int fsb)
-{
- unsigned char xmul, xdiv;
- unsigned char mul = 0, div = 0;
- int tried = 0;
-
- /* Try to calculate multiplier and divider up to 4 times */
- while (((mul == 0) || (div == 0)) && (tried <= 3)) {
- for (xdiv = 2; xdiv <= 0x80; xdiv++)
- for (xmul = 1; xmul <= 0xfe; xmul++)
- if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
- fsb + tried) {
- mul = xmul;
- div = xdiv;
- }
- tried++;
- }
-
- if ((mul == 0) || (div == 0))
- return -1;
-
- return NFORCE2_PLL(mul, div);
-}
-
-/**
- * nforce2_write_pll - write PLL value to chipset
- * @pll: PLL value
- *
- * Writes new FSB PLL value to chipset
- */
-static void nforce2_write_pll(int pll)
-{
- int temp;
-
- /* Set the pll addr. to 0x00 */
- pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
-
- /* Now write the value in all 64 registers */
- for (temp = 0; temp <= 0x3f; temp++)
- pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
-
- return;
-}
-
-/**
- * nforce2_fsb_read - Read FSB
- *
- * Read FSB from chipset
- * If bootfsb != 0, return FSB at boot-time
- */
-static unsigned int nforce2_fsb_read(int bootfsb)
-{
- struct pci_dev *nforce2_sub5;
- u32 fsb, temp = 0;
-
- /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
- nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
- PCI_ANY_ID, PCI_ANY_ID, NULL);
- if (!nforce2_sub5)
- return 0;
-
- pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
- fsb /= 1000000;
-
- /* Check if PLL register is already set */
- pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
-
- if (bootfsb || !temp)
- return fsb;
-
- /* Use PLL register FSB value */
- pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
- fsb = nforce2_calc_fsb(temp);
-
- return fsb;
-}
-
-/**
- * nforce2_set_fsb - set new FSB
- * @fsb: New FSB
- *
- * Sets new FSB
- */
-static int nforce2_set_fsb(unsigned int fsb)
-{
- u32 temp = 0;
- unsigned int tfsb;
- int diff;
- int pll = 0;
-
- if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
- printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
- return -EINVAL;
- }
-
- tfsb = nforce2_fsb_read(0);
- if (!tfsb) {
- printk(KERN_ERR PFX "Error while reading the FSB\n");
- return -EINVAL;
- }
-
- /* First write? Then set actual value */
- pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
- if (!temp) {
- pll = nforce2_calc_pll(tfsb);
-
- if (pll < 0)
- return -EINVAL;
-
- nforce2_write_pll(pll);
- }
-
- /* Enable write access */
- temp = 0x01;
- pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
-
- diff = tfsb - fsb;
-
- if (!diff)
- return 0;
-
- while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
- if (diff < 0)
- tfsb++;
- else
- tfsb--;
-
- /* Calculate the PLL reg. value */
- pll = nforce2_calc_pll(tfsb);
- if (pll == -1)
- return -EINVAL;
-
- nforce2_write_pll(pll);
-#ifdef NFORCE2_DELAY
- mdelay(NFORCE2_DELAY);
-#endif
- }
-
- temp = 0x40;
- pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
-
- return 0;
-}
-
-/**
- * nforce2_get - get the CPU frequency
- * @cpu: CPU number
- *
- * Returns the CPU frequency
- */
-static unsigned int nforce2_get(unsigned int cpu)
-{
- if (cpu)
- return 0;
- return nforce2_fsb_read(0) * fid * 100;
-}
-
-/**
- * nforce2_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int nforce2_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
-/* unsigned long flags; */
- struct cpufreq_freqs freqs;
- unsigned int target_fsb;
-
- if ((target_freq > policy->max) || (target_freq < policy->min))
- return -EINVAL;
-
- target_fsb = target_freq / (fid * 100);
-
- freqs.old = nforce2_get(policy->cpu);
- freqs.new = target_fsb * fid * 100;
- freqs.cpu = 0; /* Only one CPU on nForce2 platforms */
-
- if (freqs.old == freqs.new)
- return 0;
-
- dprintk("Old CPU frequency %d kHz, new %d kHz\n",
- freqs.old, freqs.new);
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Disable IRQs */
- /* local_irq_save(flags); */
-
- if (nforce2_set_fsb(target_fsb) < 0)
- printk(KERN_ERR PFX "Changing FSB to %d failed\n",
- target_fsb);
- else
- dprintk("Changed FSB successfully to %d\n",
- target_fsb);
-
- /* Enable IRQs */
- /* local_irq_restore(flags); */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return 0;
-}
-
-/**
- * nforce2_verify - verifies a new CPUFreq policy
- * @policy: new policy
- */
-static int nforce2_verify(struct cpufreq_policy *policy)
-{
- unsigned int fsb_pol_max;
-
- fsb_pol_max = policy->max / (fid * 100);
-
- if (policy->min < (fsb_pol_max * fid * 100))
- policy->max = (fsb_pol_max + 1) * fid * 100;
-
- cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
- return 0;
-}
-
-static int nforce2_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int fsb;
- unsigned int rfid;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* Get current FSB */
- fsb = nforce2_fsb_read(0);
-
- if (!fsb)
- return -EIO;
-
- /* FIX: Get FID from CPU */
- if (!fid) {
- if (!cpu_khz) {
- printk(KERN_WARNING PFX
- "cpu_khz not set, can't calculate multiplier!\n");
- return -ENODEV;
- }
-
- fid = cpu_khz / (fsb * 100);
- rfid = fid % 5;
-
- if (rfid) {
- if (rfid > 2)
- fid += 5 - rfid;
- else
- fid -= rfid;
- }
- }
-
- printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
- fid / 10, fid % 10);
-
- /* Set maximum FSB to FSB at boot time */
- max_fsb = nforce2_fsb_read(1);
-
- if (!max_fsb)
- return -EIO;
-
- if (!min_fsb)
- min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
-
- if (min_fsb < NFORCE2_MIN_FSB)
- min_fsb = NFORCE2_MIN_FSB;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.min_freq = min_fsb * fid * 100;
- policy->cpuinfo.max_freq = max_fsb * fid * 100;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = nforce2_get(policy->cpu);
- policy->min = policy->cpuinfo.min_freq;
- policy->max = policy->cpuinfo.max_freq;
-
- return 0;
-}
-
-static int nforce2_cpu_exit(struct cpufreq_policy *policy)
-{
- return 0;
-}
-
-static struct cpufreq_driver nforce2_driver = {
- .name = "nforce2",
- .verify = nforce2_verify,
- .target = nforce2_target,
- .get = nforce2_get,
- .init = nforce2_cpu_init,
- .exit = nforce2_cpu_exit,
- .owner = THIS_MODULE,
-};
-
-/**
- * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
- *
- * Detects nForce2 A2 and C1 stepping
- *
- */
-static int nforce2_detect_chipset(void)
-{
- nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
- PCI_DEVICE_ID_NVIDIA_NFORCE2,
- PCI_ANY_ID, PCI_ANY_ID, NULL);
-
- if (nforce2_dev == NULL)
- return -ENODEV;
-
- printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
- nforce2_dev->revision);
- printk(KERN_INFO PFX
- "FSB changing is maybe unstable and can lead to "
- "crashes and data loss.\n");
-
- return 0;
-}
-
-/**
- * nforce2_init - initializes the nForce2 CPUFreq driver
- *
- * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initiatization, and zero on
- * success.
- */
-static int __init nforce2_init(void)
-{
- /* TODO: do we need to detect the processor? */
-
- /* detect chipset */
- if (nforce2_detect_chipset()) {
- printk(KERN_INFO PFX "No nForce2 chipset.\n");
- return -ENODEV;
- }
-
- return cpufreq_register_driver(&nforce2_driver);
-}
-
-/**
- * nforce2_exit - unregisters cpufreq module
- *
- * Unregisters nForce2 FSB change support.
- */
-static void __exit nforce2_exit(void)
-{
- cpufreq_unregister_driver(&nforce2_driver);
-}
-
-module_init(nforce2_init);
-module_exit(nforce2_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Based on documentation provided by Dave Jones. Thanks!
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-
-#define EPS_BRAND_C7M 0
-#define EPS_BRAND_C7 1
-#define EPS_BRAND_EDEN 2
-#define EPS_BRAND_C3 3
-#define EPS_BRAND_C7D 4
-
-struct eps_cpu_data {
- u32 fsb;
- struct cpufreq_frequency_table freq_table[];
-};
-
-static struct eps_cpu_data *eps_cpu[NR_CPUS];
-
-
-static unsigned int eps_get(unsigned int cpu)
-{
- struct eps_cpu_data *centaur;
- u32 lo, hi;
-
- if (cpu)
- return 0;
- centaur = eps_cpu[cpu];
- if (centaur == NULL)
- return 0;
-
- /* Return current frequency */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- return centaur->fsb * ((lo >> 8) & 0xff);
-}
-
-static int eps_set_state(struct eps_cpu_data *centaur,
- unsigned int cpu,
- u32 dest_state)
-{
- struct cpufreq_freqs freqs;
- u32 lo, hi;
- int err = 0;
- int i;
-
- freqs.old = eps_get(cpu);
- freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
- freqs.cpu = cpu;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Wait while CPU is busy */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i = 0;
- while (lo & ((1 << 16) | (1 << 17))) {
- udelay(16);
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i++;
- if (unlikely(i > 64)) {
- err = -ENODEV;
- goto postchange;
- }
- }
- /* Set new multiplier and voltage */
- wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
- /* Wait until transition end */
- i = 0;
- do {
- udelay(16);
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i++;
- if (unlikely(i > 64)) {
- err = -ENODEV;
- goto postchange;
- }
- } while (lo & ((1 << 16) | (1 << 17)));
-
- /* Return current frequency */
-postchange:
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
-
-#ifdef DEBUG
- {
- u8 current_multiplier, current_voltage;
-
- /* Print voltage and multiplier */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- current_voltage = lo & 0xff;
- printk(KERN_INFO "eps: Current voltage = %dmV\n",
- current_voltage * 16 + 700);
- current_multiplier = (lo >> 8) & 0xff;
- printk(KERN_INFO "eps: Current multiplier = %d\n",
- current_multiplier);
- }
-#endif
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- return err;
-}
-
-static int eps_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- struct eps_cpu_data *centaur;
- unsigned int newstate = 0;
- unsigned int cpu = policy->cpu;
- unsigned int dest_state;
- int ret;
-
- if (unlikely(eps_cpu[cpu] == NULL))
- return -ENODEV;
- centaur = eps_cpu[cpu];
-
- if (unlikely(cpufreq_frequency_table_target(policy,
- &eps_cpu[cpu]->freq_table[0],
- target_freq,
- relation,
- &newstate))) {
- return -EINVAL;
- }
-
- /* Make frequency transition */
- dest_state = centaur->freq_table[newstate].index & 0xffff;
- ret = eps_set_state(centaur, cpu, dest_state);
- if (ret)
- printk(KERN_ERR "eps: Timeout!\n");
- return ret;
-}
-
-static int eps_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy,
- &eps_cpu[policy->cpu]->freq_table[0]);
-}
-
-static int eps_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i;
- u32 lo, hi;
- u64 val;
- u8 current_multiplier, current_voltage;
- u8 max_multiplier, max_voltage;
- u8 min_multiplier, min_voltage;
- u8 brand = 0;
- u32 fsb;
- struct eps_cpu_data *centaur;
- struct cpuinfo_x86 *c = &cpu_data(0);
- struct cpufreq_frequency_table *f_table;
- int k, step, voltage;
- int ret;
- int states;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* Check brand */
- printk(KERN_INFO "eps: Detected VIA ");
-
- switch (c->x86_model) {
- case 10:
- rdmsr(0x1153, lo, hi);
- brand = (((lo >> 2) ^ lo) >> 18) & 3;
- printk(KERN_CONT "Model A ");
- break;
- case 13:
- rdmsr(0x1154, lo, hi);
- brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
- printk(KERN_CONT "Model D ");
- break;
- }
-
- switch (brand) {
- case EPS_BRAND_C7M:
- printk(KERN_CONT "C7-M\n");
- break;
- case EPS_BRAND_C7:
- printk(KERN_CONT "C7\n");
- break;
- case EPS_BRAND_EDEN:
- printk(KERN_CONT "Eden\n");
- break;
- case EPS_BRAND_C7D:
- printk(KERN_CONT "C7-D\n");
- break;
- case EPS_BRAND_C3:
- printk(KERN_CONT "C3\n");
- return -ENODEV;
- break;
- }
- /* Enable Enhanced PowerSaver */
- rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
- wrmsrl(MSR_IA32_MISC_ENABLE, val);
- /* Can be locked at 0 */
- rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
- return -ENODEV;
- }
- }
-
- /* Print voltage and multiplier */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- current_voltage = lo & 0xff;
- printk(KERN_INFO "eps: Current voltage = %dmV\n",
- current_voltage * 16 + 700);
- current_multiplier = (lo >> 8) & 0xff;
- printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
-
- /* Print limits */
- max_voltage = hi & 0xff;
- printk(KERN_INFO "eps: Highest voltage = %dmV\n",
- max_voltage * 16 + 700);
- max_multiplier = (hi >> 8) & 0xff;
- printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
- min_voltage = (hi >> 16) & 0xff;
- printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
- min_voltage * 16 + 700);
- min_multiplier = (hi >> 24) & 0xff;
- printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
-
- /* Sanity checks */
- if (current_multiplier == 0 || max_multiplier == 0
- || min_multiplier == 0)
- return -EINVAL;
- if (current_multiplier > max_multiplier
- || max_multiplier <= min_multiplier)
- return -EINVAL;
- if (current_voltage > 0x1f || max_voltage > 0x1f)
- return -EINVAL;
- if (max_voltage < min_voltage)
- return -EINVAL;
-
- /* Calc FSB speed */
- fsb = cpu_khz / current_multiplier;
- /* Calc number of p-states supported */
- if (brand == EPS_BRAND_C7M)
- states = max_multiplier - min_multiplier + 1;
- else
- states = 2;
-
- /* Allocate private data and frequency table for current cpu */
- centaur = kzalloc(sizeof(struct eps_cpu_data)
- + (states + 1) * sizeof(struct cpufreq_frequency_table),
- GFP_KERNEL);
- if (!centaur)
- return -ENOMEM;
- eps_cpu[0] = centaur;
-
- /* Copy basic values */
- centaur->fsb = fsb;
-
- /* Fill frequency and MSR value table */
- f_table = &centaur->freq_table[0];
- if (brand != EPS_BRAND_C7M) {
- f_table[0].frequency = fsb * min_multiplier;
- f_table[0].index = (min_multiplier << 8) | min_voltage;
- f_table[1].frequency = fsb * max_multiplier;
- f_table[1].index = (max_multiplier << 8) | max_voltage;
- f_table[2].frequency = CPUFREQ_TABLE_END;
- } else {
- k = 0;
- step = ((max_voltage - min_voltage) * 256)
- / (max_multiplier - min_multiplier);
- for (i = min_multiplier; i <= max_multiplier; i++) {
- voltage = (k * step) / 256 + min_voltage;
- f_table[k].frequency = fsb * i;
- f_table[k].index = (i << 8) | voltage;
- k++;
- }
- f_table[k].frequency = CPUFREQ_TABLE_END;
- }
-
- policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
- policy->cur = fsb * current_multiplier;
-
- ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
- if (ret) {
- kfree(centaur);
- return ret;
- }
-
- cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
- return 0;
-}
-
-static int eps_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
- struct eps_cpu_data *centaur;
- u32 lo, hi;
-
- if (eps_cpu[cpu] == NULL)
- return -ENODEV;
- centaur = eps_cpu[cpu];
-
- /* Get max frequency */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- /* Set max frequency */
- eps_set_state(centaur, cpu, hi & 0xffff);
- /* Bye */
- cpufreq_frequency_table_put_attr(policy->cpu);
- kfree(eps_cpu[cpu]);
- eps_cpu[cpu] = NULL;
- return 0;
-}
-
-static struct freq_attr *eps_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver eps_driver = {
- .verify = eps_verify,
- .target = eps_target,
- .init = eps_cpu_init,
- .exit = eps_cpu_exit,
- .get = eps_get,
- .name = "e_powersaver",
- .owner = THIS_MODULE,
- .attr = eps_attr,
-};
-
-static int __init eps_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /* This driver will work only on Centaur C7 processors with
- * Enhanced SpeedStep/PowerSaver registers */
- if (c->x86_vendor != X86_VENDOR_CENTAUR
- || c->x86 != 6 || c->x86_model < 10)
- return -ENODEV;
- if (!cpu_has(c, X86_FEATURE_EST))
- return -ENODEV;
-
- if (cpufreq_register_driver(&eps_driver))
- return -EINVAL;
- return 0;
-}
-
-static void __exit eps_exit(void)
-{
- cpufreq_unregister_driver(&eps_driver);
-}
-
-MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
-MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
-MODULE_LICENSE("GPL");
-
-module_init(eps_init);
-module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index c587db472a7..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * elanfreq: cpufreq driver for the AMD ELAN family
- *
- * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
- *
- * Parts of this code are (c) Sven Geggus <sven@geggus.net>
- *
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-
-#include <asm/msr.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
-#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
-
-/* Module parameter */
-static int max_freq;
-
-struct s_elan_multiplier {
- int clock; /* frequency in kHz */
- int val40h; /* PMU Force Mode register */
- int val80h; /* CPU Clock Speed Register */
-};
-
-/*
- * It is important that the frequencies
- * are listed in ascending order here!
- */
-static struct s_elan_multiplier elan_multiplier[] = {
- {1000, 0x02, 0x18},
- {2000, 0x02, 0x10},
- {4000, 0x02, 0x08},
- {8000, 0x00, 0x00},
- {16000, 0x00, 0x02},
- {33000, 0x00, 0x04},
- {66000, 0x01, 0x04},
- {99000, 0x01, 0x05}
-};
-
-static struct cpufreq_frequency_table elanfreq_table[] = {
- {0, 1000},
- {1, 2000},
- {2, 4000},
- {3, 8000},
- {4, 16000},
- {5, 33000},
- {6, 66000},
- {7, 99000},
- {0, CPUFREQ_TABLE_END},
-};
-
-
-/**
- * elanfreq_get_cpu_frequency: determine current cpu speed
- *
- * Finds out at which frequency the CPU of the Elan SOC runs
- * at the moment. Frequencies from 1 to 33 MHz are generated
- * the normal way, 66 and 99 MHz are called "Hyperspeed Mode"
- * and have the rest of the chip running with 33 MHz.
- */
-
-static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
-{
- u8 clockspeed_reg; /* Clock Speed Register */
-
- local_irq_disable();
- outb_p(0x80, REG_CSCIR);
- clockspeed_reg = inb_p(REG_CSCDR);
- local_irq_enable();
-
- if ((clockspeed_reg & 0xE0) == 0xE0)
- return 0;
-
- /* Are we in CPU clock multiplied mode (66/99 MHz)? */
- if ((clockspeed_reg & 0xE0) == 0xC0) {
- if ((clockspeed_reg & 0x01) == 0)
- return 66000;
- else
- return 99000;
- }
-
- /* 33 MHz is not 32 MHz... */
- if ((clockspeed_reg & 0xE0) == 0xA0)
- return 33000;
-
- return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
-}
-
-
-/**
- * elanfreq_set_cpu_frequency: Change the CPU core frequency
- * @cpu: cpu number
- * @freq: frequency in kHz
- *
- * This function takes a frequency value and changes the CPU frequency
- * according to this. Note that the frequency has to be checked by
- * elanfreq_validatespeed() for correctness!
- *
- * There is no return value.
- */
-
-static void elanfreq_set_cpu_state(unsigned int state)
-{
- struct cpufreq_freqs freqs;
-
- freqs.old = elanfreq_get_cpu_frequency(0);
- freqs.new = elan_multiplier[state].clock;
- freqs.cpu = 0; /* elanfreq.c is UP only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
- elan_multiplier[state].clock);
-
-
- /*
- * Access to the Elan's internal registers is indexed via
- * 0x22: Chip Setup & Control Register Index Register (CSCI)
- * 0x23: Chip Setup & Control Register Data Register (CSCD)
- *
- */
-
- /*
- * 0x40 is the Power Management Unit's Force Mode Register.
- * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
- */
-
- local_irq_disable();
- outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
- outb_p(0x00, REG_CSCDR);
- local_irq_enable(); /* wait till internal pipelines and */
- udelay(1000); /* buffers have cleaned up */
-
- local_irq_disable();
-
- /* now, set the CPU clock speed register (0x80) */
- outb_p(0x80, REG_CSCIR);
- outb_p(elan_multiplier[state].val80h, REG_CSCDR);
-
- /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
- outb_p(0x40, REG_CSCIR);
- outb_p(elan_multiplier[state].val40h, REG_CSCDR);
- udelay(10000);
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-};
-
-
-/**
- * elanfreq_validatespeed: test if frequency range is valid
- * @policy: the policy to validate
- *
- * This function checks if a given frequency range in kHz is valid
- * for the hardware supported by the driver.
- */
-
-static int elanfreq_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
-}
-
-static int elanfreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- elanfreq_set_cpu_state(newstate);
-
- return 0;
-}
-
-
-/*
- * Module init and exit code
- */
-
-static int elanfreq_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- unsigned int i;
- int result;
-
- /* capability check */
- if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model != 10))
- return -ENODEV;
-
- /* max freq */
- if (!max_freq)
- max_freq = elanfreq_get_cpu_frequency(0);
-
- /* table init */
- for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
- if (elanfreq_table[i].frequency > max_freq)
- elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
- }
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = elanfreq_get_cpu_frequency(0);
-
- result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
- return 0;
-}
-
-
-static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-
-#ifndef MODULE
-/**
- * elanfreq_setup - elanfreq command line parameter parsing
- *
- * elanfreq command line parameter. Use:
- * elanfreq=66000
- * to set the maximum CPU frequency to 66 MHz. Note that in
- * case you do not give this boot parameter, the maximum
- * frequency will fall back to _current_ CPU frequency which
- * might be lower. If you build this as a module, use the
- * max_freq module parameter instead.
- */
-static int __init elanfreq_setup(char *str)
-{
- max_freq = simple_strtoul(str, &str, 0);
- printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
- return 1;
-}
-__setup("elanfreq=", elanfreq_setup);
-#endif
-
-
-static struct freq_attr *elanfreq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver elanfreq_driver = {
- .get = elanfreq_get_cpu_frequency,
- .verify = elanfreq_verify,
- .target = elanfreq_target,
- .init = elanfreq_cpu_init,
- .exit = elanfreq_cpu_exit,
- .name = "elanfreq",
- .owner = THIS_MODULE,
- .attr = elanfreq_attr,
-};
-
-
-static int __init elanfreq_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /* Test if we have the right hardware */
- if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model != 10)) {
- printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
- return -ENODEV;
- }
- return cpufreq_register_driver(&elanfreq_driver);
-}
-
-
-static void __exit elanfreq_exit(void)
-{
- cpufreq_unregister_driver(&elanfreq_driver);
-}
-
-
-module_param(max_freq, int, 0444);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
- "Sven Geggus <sven@geggus.net>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
-
-module_init(elanfreq_init);
-module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index 32974cf8423..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * Cyrix MediaGX and NatSemi Geode Suspend Modulation
- * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * (C) 2002 Hiroshi Miura <miura@da-cha.org>
- * All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation
- *
- * The author(s) of this software shall not be held liable for damages
- * of any nature resulting due to the use of this software. This
- * software is provided AS-IS with no warranties.
- *
- * Theoretical note:
- *
- * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
- *
- * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0
- * are based on Suspend Modulation.
- *
- * Suspend Modulation works by asserting and de-asserting the SUSP# pin
- * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP#
- * the CPU enters an idle state. GX1 stops its core clock when SUSP# is
- * asserted then power consumption is reduced.
- *
- * Suspend Modulation's OFF/ON duration are configurable
- * with 'Suspend Modulation OFF Count Register'
- * and 'Suspend Modulation ON Count Register'.
- * These registers are 8bit counters that represent the number of
- * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF)
- * to the processor.
- *
- * These counters define a ratio which is the effective frequency
- * of operation of the system.
- *
- * OFF Count
- * F_eff = Fgx * ----------------------
- * OFF Count + ON Count
- *
- * 0 <= On Count, Off Count <= 255
- *
- * From these limits, we can get register values
- *
- * off_duration + on_duration <= MAX_DURATION
- * on_duration = off_duration * (stock_freq - freq) / freq
- *
- * off_duration = (freq * DURATION) / stock_freq
- * on_duration = DURATION - off_duration
- *
- *
- *---------------------------------------------------------------------------
- *
- * ChangeLog:
- * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
- * - fix on/off register mistake
- * - fix cpu_khz calc when it stops cpu modulation.
- *
- * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
- * - rewrite for Cyrix MediaGX Cx5510/5520 and
- * NatSemi Geode Cs5530(A).
- *
- * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * - cs5530_mod patch for 2.4.19-rc1.
- *
- *---------------------------------------------------------------------------
- *
- * Todo
- * Test on machines with 5510, 5530, 5530A
- */
-
-/************************************************************************
- * Suspend Modulation - Definitions *
- ************************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-
-#include <asm/processor-cyrix.h>
-
-/* PCI config registers, all at F0 */
-#define PCI_PMER1 0x80 /* power management enable register 1 */
-#define PCI_PMER2 0x81 /* power management enable register 2 */
-#define PCI_PMER3 0x82 /* power management enable register 3 */
-#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
-#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
-#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
-#define PCI_MODON 0x95 /* suspend modulation ON counter register */
-#define PCI_SUSCFG 0x96 /* suspend configuration register */
-
-/* PMER1 bits */
-#define GPM (1<<0) /* global power management */
-#define GIT (1<<1) /* globally enable PM device idle timers */
-#define GTR (1<<2) /* globally enable IO traps */
-#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
-#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
-
-/* SUSCFG bits */
-#define SUSMOD (1<<0) /* enable/disable suspend modulation */
-/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
-#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
- /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
-#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
-/* the below is supported only with cs5530A */
-#define PWRSVE_ISA (1<<3) /* stop ISA clock */
-#define PWRSVE (1<<4) /* active idle */
-
-struct gxfreq_params {
- u8 on_duration;
- u8 off_duration;
- u8 pci_suscfg;
- u8 pci_pmer1;
- u8 pci_pmer2;
- struct pci_dev *cs55x0;
-};
-
-static struct gxfreq_params *gx_params;
-static int stock_freq;
-
-/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
-static int pci_busclk;
-module_param(pci_busclk, int, 0444);
-
-/* maximum duration for which the cpu may be suspended
- * (32us * MAX_DURATION). If no parameter is given, this defaults
- * to 255.
- * Note that this leads to a maximum of 8 ms(!) where the CPU clock
- * is suspended -- processing power is just 0.39% of what it used to be,
- * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
-static int max_duration = 255;
-module_param(max_duration, int, 0444);
-
-/* For the default policy, we want at least some processing power
- * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
- */
-#define POLICY_MIN_DIV 20
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "gx-suspmod", msg)
-
-/**
- * we can detect a core multipiler from dir0_lsb
- * from GX1 datasheet p.56,
- * MULT[3:0]:
- * 0000 = SYSCLK multiplied by 4 (test only)
- * 0001 = SYSCLK multiplied by 10
- * 0010 = SYSCLK multiplied by 4
- * 0011 = SYSCLK multiplied by 6
- * 0100 = SYSCLK multiplied by 9
- * 0101 = SYSCLK multiplied by 5
- * 0110 = SYSCLK multiplied by 7
- * 0111 = SYSCLK multiplied by 8
- * of 33.3MHz
- **/
-static int gx_freq_mult[16] = {
- 4, 10, 4, 6, 9, 5, 7, 8,
- 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-
-/****************************************************************
- * Low Level chipset interface *
- ****************************************************************/
-static struct pci_device_id gx_chipset_tbl[] __initdata = {
- { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
- { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
- { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
- { 0, },
-};
-
-static void gx_write_byte(int reg, int value)
-{
- pci_write_config_byte(gx_params->cs55x0, reg, value);
-}
-
-/**
- * gx_detect_chipset:
- *
- **/
-static __init struct pci_dev *gx_detect_chipset(void)
-{
- struct pci_dev *gx_pci = NULL;
-
- /* check if CPU is a MediaGX or a Geode. */
- if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
- (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
- dprintk("error: no MediaGX/Geode processor found!\n");
- return NULL;
- }
-
- /* detect which companion chip is used */
- for_each_pci_dev(gx_pci) {
- if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
- return gx_pci;
- }
-
- dprintk("error: no supported chipset found!\n");
- return NULL;
-}
-
-/**
- * gx_get_cpuspeed:
- *
- * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi
- * Geode CPU runs.
- */
-static unsigned int gx_get_cpuspeed(unsigned int cpu)
-{
- if ((gx_params->pci_suscfg & SUSMOD) == 0)
- return stock_freq;
-
- return (stock_freq * gx_params->off_duration)
- / (gx_params->on_duration + gx_params->off_duration);
-}
-
-/**
- * gx_validate_speed:
- * determine current cpu speed
- *
- **/
-
-static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
- u8 *off_duration)
-{
- unsigned int i;
- u8 tmp_on, tmp_off;
- int old_tmp_freq = stock_freq;
- int tmp_freq;
-
- *off_duration = 1;
- *on_duration = 0;
-
- for (i = max_duration; i > 0; i--) {
- tmp_off = ((khz * i) / stock_freq) & 0xff;
- tmp_on = i - tmp_off;
- tmp_freq = (stock_freq * tmp_off) / i;
- /* if this relation is closer to khz, use this. If it's equal,
- * prefer it, too - lower latency */
- if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
- *on_duration = tmp_on;
- *off_duration = tmp_off;
- old_tmp_freq = tmp_freq;
- }
- }
-
- return old_tmp_freq;
-}
-
-
-/**
- * gx_set_cpuspeed:
- * set cpu speed in khz.
- **/
-
-static void gx_set_cpuspeed(unsigned int khz)
-{
- u8 suscfg, pmer1;
- unsigned int new_khz;
- unsigned long flags;
- struct cpufreq_freqs freqs;
-
- freqs.cpu = 0;
- freqs.old = gx_get_cpuspeed(0);
-
- new_khz = gx_validate_speed(khz, &gx_params->on_duration,
- &gx_params->off_duration);
-
- freqs.new = new_khz;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- local_irq_save(flags);
-
-
-
- if (new_khz != stock_freq) {
- /* if new khz == 100% of CPU speed, it is special case */
- switch (gx_params->cs55x0->device) {
- case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
- pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
- /* FIXME: need to test other values -- Zwane,Miura */
- /* typical 2 to 4ms */
- gx_write_byte(PCI_IRQTC, 4);
- /* typical 50 to 100ms */
- gx_write_byte(PCI_VIDTC, 100);
- gx_write_byte(PCI_PMER1, pmer1);
-
- if (gx_params->cs55x0->revision < 0x10) {
- /* CS5530(rev 1.2, 1.3) */
- suscfg = gx_params->pci_suscfg|SUSMOD;
- } else {
- /* CS5530A,B.. */
- suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
- }
- break;
- case PCI_DEVICE_ID_CYRIX_5520:
- case PCI_DEVICE_ID_CYRIX_5510:
- suscfg = gx_params->pci_suscfg | SUSMOD;
- break;
- default:
- local_irq_restore(flags);
- dprintk("fatal: try to set unknown chipset.\n");
- return;
- }
- } else {
- suscfg = gx_params->pci_suscfg & ~(SUSMOD);
- gx_params->off_duration = 0;
- gx_params->on_duration = 0;
- dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
- }
-
- gx_write_byte(PCI_MODOFF, gx_params->off_duration);
- gx_write_byte(PCI_MODON, gx_params->on_duration);
-
- gx_write_byte(PCI_SUSCFG, suscfg);
- pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
-
- local_irq_restore(flags);
-
- gx_params->pci_suscfg = suscfg;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
- gx_params->on_duration * 32, gx_params->off_duration * 32);
- dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
-}
-
-/****************************************************************
- * High level functions *
- ****************************************************************/
-
-/*
- * cpufreq_gx_verify: test if frequency range is valid
- *
- * This function checks if a given frequency range in kHz is valid
- * for the hardware supported by the driver.
- */
-
-static int cpufreq_gx_verify(struct cpufreq_policy *policy)
-{
- unsigned int tmp_freq = 0;
- u8 tmp1, tmp2;
-
- if (!stock_freq || !policy)
- return -EINVAL;
-
- policy->cpu = 0;
- cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
- stock_freq);
-
- /* it needs to be assured that at least one supported frequency is
- * within policy->min and policy->max. If it is not, policy->max
- * needs to be increased until one freuqency is supported.
- * policy->min may not be decreased, though. This way we guarantee a
- * specific processing capacity.
- */
- tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
- if (tmp_freq < policy->min)
- tmp_freq += stock_freq / max_duration;
- policy->min = tmp_freq;
- if (policy->min > policy->max)
- policy->max = tmp_freq;
- tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
- if (tmp_freq > policy->max)
- tmp_freq -= stock_freq / max_duration;
- policy->max = tmp_freq;
- if (policy->max < policy->min)
- policy->max = policy->min;
- cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
- stock_freq);
-
- return 0;
-}
-
-/*
- * cpufreq_gx_target:
- *
- */
-static int cpufreq_gx_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- u8 tmp1, tmp2;
- unsigned int tmp_freq;
-
- if (!stock_freq || !policy)
- return -EINVAL;
-
- policy->cpu = 0;
-
- tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
- while (tmp_freq < policy->min) {
- tmp_freq += stock_freq / max_duration;
- tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
- }
- while (tmp_freq > policy->max) {
- tmp_freq -= stock_freq / max_duration;
- tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
- }
-
- gx_set_cpuspeed(tmp_freq);
-
- return 0;
-}
-
-static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int maxfreq, curfreq;
-
- if (!policy || policy->cpu != 0)
- return -ENODEV;
-
- /* determine maximum frequency */
- if (pci_busclk)
- maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
- else if (cpu_khz)
- maxfreq = cpu_khz;
- else
- maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-
- stock_freq = maxfreq;
- curfreq = gx_get_cpuspeed(0);
-
- dprintk("cpu max frequency is %d.\n", maxfreq);
- dprintk("cpu current frequency is %dkHz.\n", curfreq);
-
- /* setup basic struct for cpufreq API */
- policy->cpu = 0;
-
- if (max_duration < POLICY_MIN_DIV)
- policy->min = maxfreq / max_duration;
- else
- policy->min = maxfreq / POLICY_MIN_DIV;
- policy->max = maxfreq;
- policy->cur = curfreq;
- policy->cpuinfo.min_freq = maxfreq / max_duration;
- policy->cpuinfo.max_freq = maxfreq;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
-
- return 0;
-}
-
-/*
- * cpufreq_gx_init:
- * MediaGX/Geode GX initialize cpufreq driver
- */
-static struct cpufreq_driver gx_suspmod_driver = {
- .get = gx_get_cpuspeed,
- .verify = cpufreq_gx_verify,
- .target = cpufreq_gx_target,
- .init = cpufreq_gx_cpu_init,
- .name = "gx-suspmod",
- .owner = THIS_MODULE,
-};
-
-static int __init cpufreq_gx_init(void)
-{
- int ret;
- struct gxfreq_params *params;
- struct pci_dev *gx_pci;
-
- /* Test if we have the right hardware */
- gx_pci = gx_detect_chipset();
- if (gx_pci == NULL)
- return -ENODEV;
-
- /* check whether module parameters are sane */
- if (max_duration > 0xff)
- max_duration = 0xff;
-
- dprintk("geode suspend modulation available.\n");
-
- params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
- if (params == NULL)
- return -ENOMEM;
-
- params->cs55x0 = gx_pci;
- gx_params = params;
-
- /* keep cs55x0 configurations */
- pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
- pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
- pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
- pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
- pci_read_config_byte(params->cs55x0, PCI_MODOFF,
- &(params->off_duration));
-
- ret = cpufreq_register_driver(&gx_suspmod_driver);
- if (ret) {
- kfree(params);
- return ret; /* register error! */
- }
-
- return 0;
-}
-
-static void __exit cpufreq_gx_exit(void)
-{
- cpufreq_unregister_driver(&gx_suspmod_driver);
- pci_dev_put(gx_params->cs55x0);
- kfree(gx_params);
-}
-
-MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
-MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
-MODULE_LICENSE("GPL");
-
-module_init(cpufreq_gx_init);
-module_exit(cpufreq_gx_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index cf48cdd6907..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
-/*
- * (C) 2001-2004 Dave Jones. <davej@redhat.com>
- * (C) 2002 Padraig Brady. <padraig@antefacto.com>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by VIA.
- *
- * VIA have currently 3 different versions of Longhaul.
- * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
- * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
- * Version 2 of longhaul is backward compatible with v1, but adds
- * LONGHAUL MSR for purpose of both frequency and voltage scaling.
- * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
- * Version 3 of longhaul got renamed to Powersaver and redesigned
- * to use only the POWERSAVER MSR at 0x110a.
- * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
- * It's pretty much the same feature wise to longhaul v2, though
- * there is provision for scaling FSB too, but this doesn't work
- * too well in practice so we don't even try to use this.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/acpi.h>
-
-#include <asm/msr.h>
-#include <acpi/processor.h>
-
-#include "longhaul.h"
-
-#define PFX "longhaul: "
-
-#define TYPE_LONGHAUL_V1 1
-#define TYPE_LONGHAUL_V2 2
-#define TYPE_POWERSAVER 3
-
-#define CPU_SAMUEL 1
-#define CPU_SAMUEL2 2
-#define CPU_EZRA 3
-#define CPU_EZRA_T 4
-#define CPU_NEHEMIAH 5
-#define CPU_NEHEMIAH_C 6
-
-/* Flags */
-#define USE_ACPI_C3 (1 << 1)
-#define USE_NORTHBRIDGE (1 << 2)
-
-static int cpu_model;
-static unsigned int numscales = 16;
-static unsigned int fsb;
-
-static const struct mV_pos *vrm_mV_table;
-static const unsigned char *mV_vrm_table;
-
-static unsigned int highest_speed, lowest_speed; /* kHz */
-static unsigned int minmult, maxmult;
-static int can_scale_voltage;
-static struct acpi_processor *pr;
-static struct acpi_processor_cx *cx;
-static u32 acpi_regs_addr;
-static u8 longhaul_flags;
-static unsigned int longhaul_index;
-
-/* Module parameters */
-static int scale_voltage;
-static int disable_acpi_c3;
-static int revid_errata;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "longhaul", msg)
-
-
-/* Clock ratios multiplied by 10 */
-static int mults[32];
-static int eblcr[32];
-static int longhaul_version;
-static struct cpufreq_frequency_table *longhaul_table;
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-static char speedbuffer[8];
-
-static char *print_speed(int speed)
-{
- if (speed < 1000) {
- snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
- return speedbuffer;
- }
-
- if (speed%1000 == 0)
- snprintf(speedbuffer, sizeof(speedbuffer),
- "%dGHz", speed/1000);
- else
- snprintf(speedbuffer, sizeof(speedbuffer),
- "%d.%dGHz", speed/1000, (speed%1000)/100);
-
- return speedbuffer;
-}
-#endif
-
-
-static unsigned int calc_speed(int mult)
-{
- int khz;
- khz = (mult/10)*fsb;
- if (mult%10)
- khz += fsb/2;
- khz *= 1000;
- return khz;
-}
-
-
-static int longhaul_get_cpu_mult(void)
-{
- unsigned long invalue = 0, lo, hi;
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
- invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
- if (longhaul_version == TYPE_LONGHAUL_V2 ||
- longhaul_version == TYPE_POWERSAVER) {
- if (lo & (1<<27))
- invalue += 16;
- }
- return eblcr[invalue];
-}
-
-/* For processor with BCR2 MSR */
-
-static void do_longhaul1(unsigned int mults_index)
-{
- union msr_bcr2 bcr2;
-
- rdmsrl(MSR_VIA_BCR2, bcr2.val);
- /* Enable software clock multiplier */
- bcr2.bits.ESOFTBF = 1;
- bcr2.bits.CLOCKMUL = mults_index & 0xff;
-
- /* Sync to timer tick */
- safe_halt();
- /* Change frequency on next halt or sleep */
- wrmsrl(MSR_VIA_BCR2, bcr2.val);
- /* Invoke transition */
- ACPI_FLUSH_CPU_CACHE();
- halt();
-
- /* Disable software clock multiplier */
- local_irq_disable();
- rdmsrl(MSR_VIA_BCR2, bcr2.val);
- bcr2.bits.ESOFTBF = 0;
- wrmsrl(MSR_VIA_BCR2, bcr2.val);
-}
-
-/* For processor with Longhaul MSR */
-
-static void do_powersaver(int cx_address, unsigned int mults_index,
- unsigned int dir)
-{
- union msr_longhaul longhaul;
- u32 t;
-
- rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Setup new frequency */
- if (!revid_errata)
- longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
- else
- longhaul.bits.RevisionKey = 0;
- longhaul.bits.SoftBusRatio = mults_index & 0xf;
- longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
- /* Setup new voltage */
- if (can_scale_voltage)
- longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
- /* Sync to timer tick */
- safe_halt();
- /* Raise voltage if necessary */
- if (can_scale_voltage && dir) {
- longhaul.bits.EnableSoftVID = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Change voltage */
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3
- * read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- longhaul.bits.EnableSoftVID = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- }
-
- /* Change frequency on next halt or sleep */
- longhaul.bits.EnableSoftBusRatio = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3 read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- /* Disable bus ratio bit */
- longhaul.bits.EnableSoftBusRatio = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-
- /* Reduce voltage if necessary */
- if (can_scale_voltage && !dir) {
- longhaul.bits.EnableSoftVID = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Change voltage */
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3
- * read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- longhaul.bits.EnableSoftVID = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- }
-}
-
-/**
- * longhaul_set_cpu_frequency()
- * @mults_index : bitpattern of the new multiplier.
- *
- * Sets a new clock ratio.
- */
-
-static void longhaul_setstate(unsigned int table_index)
-{
- unsigned int mults_index;
- int speed, mult;
- struct cpufreq_freqs freqs;
- unsigned long flags;
- unsigned int pic1_mask, pic2_mask;
- u16 bm_status = 0;
- u32 bm_timeout = 1000;
- unsigned int dir = 0;
-
- mults_index = longhaul_table[table_index].index;
- /* Safety precautions */
- mult = mults[mults_index & 0x1f];
- if (mult == -1)
- return;
- speed = calc_speed(mult);
- if ((speed > highest_speed) || (speed < lowest_speed))
- return;
- /* Voltage transition before frequency transition? */
- if (can_scale_voltage && longhaul_index < table_index)
- dir = 1;
-
- freqs.old = calc_speed(longhaul_get_cpu_mult());
- freqs.new = speed;
- freqs.cpu = 0; /* longhaul.c is UP only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
- fsb, mult/10, mult%10, print_speed(speed/1000));
-retry_loop:
- preempt_disable();
- local_irq_save(flags);
-
- pic2_mask = inb(0xA1);
- pic1_mask = inb(0x21); /* works on C3. save mask. */
- outb(0xFF, 0xA1); /* Overkill */
- outb(0xFE, 0x21); /* TMR0 only */
-
- /* Wait while PCI bus is busy. */
- if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
- || ((pr != NULL) && pr->flags.bm_control))) {
- bm_status = inw(acpi_regs_addr);
- bm_status &= 1 << 4;
- while (bm_status && bm_timeout) {
- outw(1 << 4, acpi_regs_addr);
- bm_timeout--;
- bm_status = inw(acpi_regs_addr);
- bm_status &= 1 << 4;
- }
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE) {
- /* Disable AGP and PCI arbiters */
- outb(3, 0x22);
- } else if ((pr != NULL) && pr->flags.bm_control) {
- /* Disable bus master arbitration */
- acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
- }
- switch (longhaul_version) {
-
- /*
- * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
- * Software controlled multipliers only.
- */
- case TYPE_LONGHAUL_V1:
- do_longhaul1(mults_index);
- break;
-
- /*
- * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
- *
- * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
- * Nehemiah can do FSB scaling too, but this has never been proven
- * to work in practice.
- */
- case TYPE_LONGHAUL_V2:
- case TYPE_POWERSAVER:
- if (longhaul_flags & USE_ACPI_C3) {
- /* Don't allow wakeup */
- acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
- do_powersaver(cx->address, mults_index, dir);
- } else {
- do_powersaver(0, mults_index, dir);
- }
- break;
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE) {
- /* Enable arbiters */
- outb(0, 0x22);
- } else if ((pr != NULL) && pr->flags.bm_control) {
- /* Enable bus master arbitration */
- acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
- }
- outb(pic2_mask, 0xA1); /* restore mask */
- outb(pic1_mask, 0x21);
-
- local_irq_restore(flags);
- preempt_enable();
-
- freqs.new = calc_speed(longhaul_get_cpu_mult());
- /* Check if requested frequency is set. */
- if (unlikely(freqs.new != speed)) {
- printk(KERN_INFO PFX "Failed to set requested frequency!\n");
- /* Revision ID = 1 but processor is expecting revision key
- * equal to 0. Jumpers at the bottom of processor will change
- * multiplier and FSB, but will not change bits in Longhaul
- * MSR nor enable voltage scaling. */
- if (!revid_errata) {
- printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
- "option.\n");
- revid_errata = 1;
- msleep(200);
- goto retry_loop;
- }
- /* Why ACPI C3 sometimes doesn't work is a mystery for me.
- * But it does happen. Processor is entering ACPI C3 state,
- * but it doesn't change frequency. I tried poking various
- * bits in northbridge registers, but without success. */
- if (longhaul_flags & USE_ACPI_C3) {
- printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
- longhaul_flags &= ~USE_ACPI_C3;
- if (revid_errata) {
- printk(KERN_INFO PFX "Disabling \"Ignore "
- "Revision ID\" option.\n");
- revid_errata = 0;
- }
- msleep(200);
- goto retry_loop;
- }
- /* This shouldn't happen. Longhaul ver. 2 was reported not
- * working on processors without voltage scaling, but with
- * RevID = 1. RevID errata will make things right. Just
- * to be 100% sure. */
- if (longhaul_version == TYPE_LONGHAUL_V2) {
- printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
- longhaul_version = TYPE_LONGHAUL_V1;
- msleep(200);
- goto retry_loop;
- }
- }
- /* Report true CPU frequency */
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- if (!bm_timeout)
- printk(KERN_INFO PFX "Warning: Timeout while waiting for "
- "idle PCI bus.\n");
-}
-
-/*
- * Centaur decided to make life a little more tricky.
- * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
- * Samuel2 and above have to try and guess what the FSB is.
- * We do this by assuming we booted at maximum multiplier, and interpolate
- * between that value multiplied by possible FSBs and cpu_mhz which
- * was calculated at boot time. Really ugly, but no other way to do this.
- */
-
-#define ROUNDING 0xf
-
-static int guess_fsb(int mult)
-{
- int speed = cpu_khz / 1000;
- int i;
- int speeds[] = { 666, 1000, 1333, 2000 };
- int f_max, f_min;
-
- for (i = 0; i < 4; i++) {
- f_max = ((speeds[i] * mult) + 50) / 100;
- f_max += (ROUNDING / 2);
- f_min = f_max - ROUNDING;
- if ((speed <= f_max) && (speed >= f_min))
- return speeds[i] / 10;
- }
- return 0;
-}
-
-
-static int __cpuinit longhaul_get_ranges(void)
-{
- unsigned int i, j, k = 0;
- unsigned int ratio;
- int mult;
-
- /* Get current frequency */
- mult = longhaul_get_cpu_mult();
- if (mult == -1) {
- printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
- return -EINVAL;
- }
- fsb = guess_fsb(mult);
- if (fsb == 0) {
- printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
- return -EINVAL;
- }
- /* Get max multiplier - as we always did.
- * Longhaul MSR is useful only when voltage scaling is enabled.
- * C3 is booting at max anyway. */
- maxmult = mult;
- /* Get min multiplier */
- switch (cpu_model) {
- case CPU_NEHEMIAH:
- minmult = 50;
- break;
- case CPU_NEHEMIAH_C:
- minmult = 40;
- break;
- default:
- minmult = 30;
- break;
- }
-
- dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
- minmult/10, minmult%10, maxmult/10, maxmult%10);
-
- highest_speed = calc_speed(maxmult);
- lowest_speed = calc_speed(minmult);
- dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
- print_speed(lowest_speed/1000),
- print_speed(highest_speed/1000));
-
- if (lowest_speed == highest_speed) {
- printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
- return -EINVAL;
- }
- if (lowest_speed > highest_speed) {
- printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
- lowest_speed, highest_speed);
- return -EINVAL;
- }
-
- longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
- GFP_KERNEL);
- if (!longhaul_table)
- return -ENOMEM;
-
- for (j = 0; j < numscales; j++) {
- ratio = mults[j];
- if (ratio == -1)
- continue;
- if (ratio > maxmult || ratio < minmult)
- continue;
- longhaul_table[k].frequency = calc_speed(ratio);
- longhaul_table[k].index = j;
- k++;
- }
- if (k <= 1) {
- kfree(longhaul_table);
- return -ENODEV;
- }
- /* Sort */
- for (j = 0; j < k - 1; j++) {
- unsigned int min_f, min_i;
- min_f = longhaul_table[j].frequency;
- min_i = j;
- for (i = j + 1; i < k; i++) {
- if (longhaul_table[i].frequency < min_f) {
- min_f = longhaul_table[i].frequency;
- min_i = i;
- }
- }
- if (min_i != j) {
- swap(longhaul_table[j].frequency,
- longhaul_table[min_i].frequency);
- swap(longhaul_table[j].index,
- longhaul_table[min_i].index);
- }
- }
-
- longhaul_table[k].frequency = CPUFREQ_TABLE_END;
-
- /* Find index we are running on */
- for (j = 0; j < k; j++) {
- if (mults[longhaul_table[j].index & 0x1f] == mult) {
- longhaul_index = j;
- break;
- }
- }
- return 0;
-}
-
-
-static void __cpuinit longhaul_setup_voltagescaling(void)
-{
- union msr_longhaul longhaul;
- struct mV_pos minvid, maxvid, vid;
- unsigned int j, speed, pos, kHz_step, numvscales;
- int min_vid_speed;
-
- rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- if (!(longhaul.bits.RevisionID & 1)) {
- printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
- return;
- }
-
- if (!longhaul.bits.VRMRev) {
- printk(KERN_INFO PFX "VRM 8.5\n");
- vrm_mV_table = &vrm85_mV[0];
- mV_vrm_table = &mV_vrm85[0];
- } else {
- printk(KERN_INFO PFX "Mobile VRM\n");
- if (cpu_model < CPU_NEHEMIAH)
- return;
- vrm_mV_table = &mobilevrm_mV[0];
- mV_vrm_table = &mV_mobilevrm[0];
- }
-
- minvid = vrm_mV_table[longhaul.bits.MinimumVID];
- maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
-
- if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
- printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
- "Voltage scaling disabled.\n",
- minvid.mV/1000, minvid.mV%1000,
- maxvid.mV/1000, maxvid.mV%1000);
- return;
- }
-
- if (minvid.mV == maxvid.mV) {
- printk(KERN_INFO PFX "Claims to support voltage scaling but "
- "min & max are both %d.%03d. "
- "Voltage scaling disabled\n",
- maxvid.mV/1000, maxvid.mV%1000);
- return;
- }
-
- /* How many voltage steps*/
- numvscales = maxvid.pos - minvid.pos + 1;
- printk(KERN_INFO PFX
- "Max VID=%d.%03d "
- "Min VID=%d.%03d, "
- "%d possible voltage scales\n",
- maxvid.mV/1000, maxvid.mV%1000,
- minvid.mV/1000, minvid.mV%1000,
- numvscales);
-
- /* Calculate max frequency at min voltage */
- j = longhaul.bits.MinMHzBR;
- if (longhaul.bits.MinMHzBR4)
- j += 16;
- min_vid_speed = eblcr[j];
- if (min_vid_speed == -1)
- return;
- switch (longhaul.bits.MinMHzFSB) {
- case 0:
- min_vid_speed *= 13333;
- break;
- case 1:
- min_vid_speed *= 10000;
- break;
- case 3:
- min_vid_speed *= 6666;
- break;
- default:
- return;
- break;
- }
- if (min_vid_speed >= highest_speed)
- return;
- /* Calculate kHz for one voltage step */
- kHz_step = (highest_speed - min_vid_speed) / numvscales;
-
- j = 0;
- while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
- speed = longhaul_table[j].frequency;
- if (speed > min_vid_speed)
- pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
- else
- pos = minvid.pos;
- longhaul_table[j].index |= mV_vrm_table[pos] << 8;
- vid = vrm_mV_table[mV_vrm_table[pos]];
- printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
- speed, j, vid.mV);
- j++;
- }
-
- can_scale_voltage = 1;
- printk(KERN_INFO PFX "Voltage scaling enabled.\n");
-}
-
-
-static int longhaul_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, longhaul_table);
-}
-
-
-static int longhaul_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- unsigned int table_index = 0;
- unsigned int i;
- unsigned int dir = 0;
- u8 vid, current_vid;
-
- if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
- relation, &table_index))
- return -EINVAL;
-
- /* Don't set same frequency again */
- if (longhaul_index == table_index)
- return 0;
-
- if (!can_scale_voltage)
- longhaul_setstate(table_index);
- else {
- /* On test system voltage transitions exceeding single
- * step up or down were turning motherboard off. Both
- * "ondemand" and "userspace" are unsafe. C7 is doing
- * this in hardware, C3 is old and we need to do this
- * in software. */
- i = longhaul_index;
- current_vid = (longhaul_table[longhaul_index].index >> 8);
- current_vid &= 0x1f;
- if (table_index > longhaul_index)
- dir = 1;
- while (i != table_index) {
- vid = (longhaul_table[i].index >> 8) & 0x1f;
- if (vid != current_vid) {
- longhaul_setstate(i);
- current_vid = vid;
- msleep(200);
- }
- if (dir)
- i++;
- else
- i--;
- }
- longhaul_setstate(table_index);
- }
- longhaul_index = table_index;
- return 0;
-}
-
-
-static unsigned int longhaul_get(unsigned int cpu)
-{
- if (cpu)
- return 0;
- return calc_speed(longhaul_get_cpu_mult());
-}
-
-static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
- u32 nesting_level,
- void *context, void **return_value)
-{
- struct acpi_device *d;
-
- if (acpi_bus_get_device(obj_handle, &d))
- return 0;
-
- *return_value = acpi_driver_data(d);
- return 1;
-}
-
-/* VIA don't support PM2 reg, but have something similar */
-static int enable_arbiter_disable(void)
-{
- struct pci_dev *dev;
- int status = 1;
- int reg;
- u8 pci_cmd;
-
- /* Find PLE133 host bridge */
- reg = 0x78;
- dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
- NULL);
- /* Find PM133/VT8605 host bridge */
- if (dev == NULL)
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_8605_0, NULL);
- /* Find CLE266 host bridge */
- if (dev == NULL) {
- reg = 0x76;
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_862X_0, NULL);
- /* Find CN400 V-Link host bridge */
- if (dev == NULL)
- dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
- }
- if (dev != NULL) {
- /* Enable access to port 0x22 */
- pci_read_config_byte(dev, reg, &pci_cmd);
- if (!(pci_cmd & 1<<7)) {
- pci_cmd |= 1<<7;
- pci_write_config_byte(dev, reg, pci_cmd);
- pci_read_config_byte(dev, reg, &pci_cmd);
- if (!(pci_cmd & 1<<7)) {
- printk(KERN_ERR PFX
- "Can't enable access to port 0x22.\n");
- status = 0;
- }
- }
- pci_dev_put(dev);
- return status;
- }
- return 0;
-}
-
-static int longhaul_setup_southbridge(void)
-{
- struct pci_dev *dev;
- u8 pci_cmd;
-
- /* Find VT8235 southbridge */
- dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
- if (dev == NULL)
- /* Find VT8237 southbridge */
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_8237, NULL);
- if (dev != NULL) {
- /* Set transition time to max */
- pci_read_config_byte(dev, 0xec, &pci_cmd);
- pci_cmd &= ~(1 << 2);
- pci_write_config_byte(dev, 0xec, pci_cmd);
- pci_read_config_byte(dev, 0xe4, &pci_cmd);
- pci_cmd &= ~(1 << 7);
- pci_write_config_byte(dev, 0xe4, pci_cmd);
- pci_read_config_byte(dev, 0xe5, &pci_cmd);
- pci_cmd |= 1 << 7;
- pci_write_config_byte(dev, 0xe5, pci_cmd);
- /* Get address of ACPI registers block*/
- pci_read_config_byte(dev, 0x81, &pci_cmd);
- if (pci_cmd & 1 << 7) {
- pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
- acpi_regs_addr &= 0xff00;
- printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
- acpi_regs_addr);
- }
-
- pci_dev_put(dev);
- return 1;
- }
- return 0;
-}
-
-static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- char *cpuname = NULL;
- int ret;
- u32 lo, hi;
-
- /* Check what we have on this motherboard */
- switch (c->x86_model) {
- case 6:
- cpu_model = CPU_SAMUEL;
- cpuname = "C3 'Samuel' [C5A]";
- longhaul_version = TYPE_LONGHAUL_V1;
- memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
- memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
- break;
-
- case 7:
- switch (c->x86_mask) {
- case 0:
- longhaul_version = TYPE_LONGHAUL_V1;
- cpu_model = CPU_SAMUEL2;
- cpuname = "C3 'Samuel 2' [C5B]";
- /* Note, this is not a typo, early Samuel2's had
- * Samuel1 ratios. */
- memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
- memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
- break;
- case 1 ... 15:
- longhaul_version = TYPE_LONGHAUL_V2;
- if (c->x86_mask < 8) {
- cpu_model = CPU_SAMUEL2;
- cpuname = "C3 'Samuel 2' [C5B]";
- } else {
- cpu_model = CPU_EZRA;
- cpuname = "C3 'Ezra' [C5C]";
- }
- memcpy(mults, ezra_mults, sizeof(ezra_mults));
- memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
- break;
- }
- break;
-
- case 8:
- cpu_model = CPU_EZRA_T;
- cpuname = "C3 'Ezra-T' [C5M]";
- longhaul_version = TYPE_POWERSAVER;
- numscales = 32;
- memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
- memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
- break;
-
- case 9:
- longhaul_version = TYPE_POWERSAVER;
- numscales = 32;
- memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
- memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
- switch (c->x86_mask) {
- case 0 ... 1:
- cpu_model = CPU_NEHEMIAH;
- cpuname = "C3 'Nehemiah A' [C5XLOE]";
- break;
- case 2 ... 4:
- cpu_model = CPU_NEHEMIAH;
- cpuname = "C3 'Nehemiah B' [C5XLOH]";
- break;
- case 5 ... 15:
- cpu_model = CPU_NEHEMIAH_C;
- cpuname = "C3 'Nehemiah C' [C5P]";
- break;
- }
- break;
-
- default:
- cpuname = "Unknown";
- break;
- }
- /* Check Longhaul ver. 2 */
- if (longhaul_version == TYPE_LONGHAUL_V2) {
- rdmsr(MSR_VIA_LONGHAUL, lo, hi);
- if (lo == 0 && hi == 0)
- /* Looks like MSR isn't present */
- longhaul_version = TYPE_LONGHAUL_V1;
- }
-
- printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
- switch (longhaul_version) {
- case TYPE_LONGHAUL_V1:
- case TYPE_LONGHAUL_V2:
- printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
- break;
- case TYPE_POWERSAVER:
- printk(KERN_CONT "Powersaver supported.\n");
- break;
- };
-
- /* Doesn't hurt */
- longhaul_setup_southbridge();
-
- /* Find ACPI data for processor */
- acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
- ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
- NULL, (void *)&pr);
-
- /* Check ACPI support for C3 state */
- if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
- cx = &pr->power.states[ACPI_STATE_C3];
- if (cx->address > 0 && cx->latency <= 1000)
- longhaul_flags |= USE_ACPI_C3;
- }
- /* Disable if it isn't working */
- if (disable_acpi_c3)
- longhaul_flags &= ~USE_ACPI_C3;
- /* Check if northbridge is friendly */
- if (enable_arbiter_disable())
- longhaul_flags |= USE_NORTHBRIDGE;
-
- /* Check ACPI support for bus master arbiter disable */
- if (!(longhaul_flags & USE_ACPI_C3
- || longhaul_flags & USE_NORTHBRIDGE)
- && ((pr == NULL) || !(pr->flags.bm_control))) {
- printk(KERN_ERR PFX
- "No ACPI support. Unsupported northbridge.\n");
- return -ENODEV;
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE)
- printk(KERN_INFO PFX "Using northbridge support.\n");
- if (longhaul_flags & USE_ACPI_C3)
- printk(KERN_INFO PFX "Using ACPI support.\n");
-
- ret = longhaul_get_ranges();
- if (ret != 0)
- return ret;
-
- if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
- longhaul_setup_voltagescaling();
-
- policy->cpuinfo.transition_latency = 200000; /* nsec */
- policy->cur = calc_speed(longhaul_get_cpu_mult());
-
- ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
- if (ret)
- return ret;
-
- cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
-
- return 0;
-}
-
-static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static struct freq_attr *longhaul_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver longhaul_driver = {
- .verify = longhaul_verify,
- .target = longhaul_target,
- .get = longhaul_get,
- .init = longhaul_cpu_init,
- .exit = __devexit_p(longhaul_cpu_exit),
- .name = "longhaul",
- .owner = THIS_MODULE,
- .attr = longhaul_attr,
-};
-
-
-static int __init longhaul_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
- return -ENODEV;
-
-#ifdef CONFIG_SMP
- if (num_online_cpus() > 1) {
- printk(KERN_ERR PFX "More than 1 CPU detected, "
- "longhaul disabled.\n");
- return -ENODEV;
- }
-#endif
-#ifdef CONFIG_X86_IO_APIC
- if (cpu_has_apic) {
- printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
- "broken in this configuration.\n");
- return -ENODEV;
- }
-#endif
- switch (c->x86_model) {
- case 6 ... 9:
- return cpufreq_register_driver(&longhaul_driver);
- case 10:
- printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
- default:
- ;
- }
-
- return -ENODEV;
-}
-
-
-static void __exit longhaul_exit(void)
-{
- int i;
-
- for (i = 0; i < numscales; i++) {
- if (mults[i] == maxmult) {
- longhaul_setstate(i);
- break;
- }
- }
-
- cpufreq_unregister_driver(&longhaul_driver);
- kfree(longhaul_table);
-}
-
-/* Even if BIOS is exporting ACPI C3 state, and it is used
- * with success when CPU is idle, this state doesn't
- * trigger frequency transition in some cases. */
-module_param(disable_acpi_c3, int, 0644);
-MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
-/* Change CPU voltage with frequency. Very useful to save
- * power, but most VIA C3 processors aren't supporting it. */
-module_param(scale_voltage, int, 0644);
-MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
-/* Force revision key to 0 for processors which doesn't
- * support voltage scaling, but are introducing itself as
- * such. */
-module_param(revid_errata, int, 0644);
-MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(longhaul_init);
-module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index cbf48fbca88..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * longhaul.h
- * (C) 2003 Dave Jones.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * VIA-specific information
- */
-
-union msr_bcr2 {
- struct {
- unsigned Reseved:19, // 18:0
- ESOFTBF:1, // 19
- Reserved2:3, // 22:20
- CLOCKMUL:4, // 26:23
- Reserved3:5; // 31:27
- } bits;
- unsigned long val;
-};
-
-union msr_longhaul {
- struct {
- unsigned RevisionID:4, // 3:0
- RevisionKey:4, // 7:4
- EnableSoftBusRatio:1, // 8
- EnableSoftVID:1, // 9
- EnableSoftBSEL:1, // 10
- Reserved:3, // 11:13
- SoftBusRatio4:1, // 14
- VRMRev:1, // 15
- SoftBusRatio:4, // 19:16
- SoftVID:5, // 24:20
- Reserved2:3, // 27:25
- SoftBSEL:2, // 29:28
- Reserved3:2, // 31:30
- MaxMHzBR:4, // 35:32
- MaximumVID:5, // 40:36
- MaxMHzFSB:2, // 42:41
- MaxMHzBR4:1, // 43
- Reserved4:4, // 47:44
- MinMHzBR:4, // 51:48
- MinimumVID:5, // 56:52
- MinMHzFSB:2, // 58:57
- MinMHzBR4:1, // 59
- Reserved5:4; // 63:60
- } bits;
- unsigned long long val;
-};
-
-/*
- * Clock ratio tables. Div/Mod by 10 to get ratio.
- * The eblcr values specify the ratio read from the CPU.
- * The mults values specify what to write to the CPU.
- */
-
-/*
- * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
- */
-static const int __cpuinitdata samuel1_mults[16] = {
- -1, /* 0000 -> RESERVED */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- -1, /* 0011 -> RESERVED */
- -1, /* 0100 -> RESERVED */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- -1, /* 1110 -> RESERVED */
- -1, /* 1111 -> RESERVED */
-};
-
-static const int __cpuinitdata samuel1_eblcr[16] = {
- 50, /* 0000 -> RESERVED */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- -1, /* 0011 -> RESERVED */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- -1, /* 0111 -> RESERVED */
- -1, /* 1000 -> RESERVED */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- -1, /* 1100 -> RESERVED */
- 75, /* 1101 -> 7.5x */
- -1, /* 1110 -> RESERVED */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 Samuel2 Stepping 1->15
- */
-static const int __cpuinitdata samuel2_eblcr[16] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 110, /* 0111 -> 11.0x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 130, /* 1110 -> 13.0x */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 Ezra
- */
-static const int __cpuinitdata ezra_mults[16] = {
- 100, /* 0000 -> 10.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
-};
-
-static const int __cpuinitdata ezra_eblcr[16] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 (Ezra-T) [C5M].
- */
-static const int __cpuinitdata ezrat_mults[32] = {
- 100, /* 0000 -> 10.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
-
- -1, /* 0000 -> RESERVED (10.0x) */
- 110, /* 0001 -> 11.0x */
- -1, /* 0010 -> 12.0x */
- -1, /* 0011 -> RESERVED (9.0x)*/
- 105, /* 0100 -> 10.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 135, /* 0111 -> 13.5x */
- 140, /* 1000 -> 14.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 130, /* 1011 -> 13.0x */
- 145, /* 1100 -> 14.5x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- -1, /* 1111 -> RESERVED (12.0x) */
-};
-
-static const int __cpuinitdata ezrat_eblcr[32] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
-
- -1, /* 0000 -> RESERVED (9.0x) */
- 110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
- -1, /* 0011 -> RESERVED (10.0x)*/
- 135, /* 0100 -> 13.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 105, /* 0111 -> 10.5x */
- 130, /* 1000 -> 13.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 140, /* 1011 -> 14.0x */
- -1, /* 1100 -> RESERVED (12.0x) */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- 145, /* 1111 -> 14.5x */
-};
-
-/*
- * VIA C3 Nehemiah */
-
-static const int __cpuinitdata nehemiah_mults[32] = {
- 100, /* 0000 -> 10.0x */
- -1, /* 0001 -> 16.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- -1, /* 0101 -> RESERVED */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
- -1, /* 0000 -> 10.0x */
- 110, /* 0001 -> 11.0x */
- -1, /* 0010 -> 12.0x */
- -1, /* 0011 -> 9.0x */
- 105, /* 0100 -> 10.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 135, /* 0111 -> 13.5x */
- 140, /* 1000 -> 14.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 130, /* 1011 -> 13.0x */
- 145, /* 1100 -> 14.5x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- -1, /* 1111 -> 12.0x */
-};
-
-static const int __cpuinitdata nehemiah_eblcr[32] = {
- 50, /* 0000 -> 5.0x */
- 160, /* 0001 -> 16.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- -1, /* 0101 -> RESERVED */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
- 90, /* 0000 -> 9.0x */
- 110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
- 100, /* 0011 -> 10.0x */
- 135, /* 0100 -> 13.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 105, /* 0111 -> 10.5x */
- 130, /* 1000 -> 13.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 140, /* 1011 -> 14.0x */
- 120, /* 1100 -> 12.0x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- 145 /* 1111 -> 14.5x */
-};
-
-/*
- * Voltage scales. Div/Mod by 1000 to get actual voltage.
- * Which scale to use depends on the VRM type in use.
- */
-
-struct mV_pos {
- unsigned short mV;
- unsigned short pos;
-};
-
-static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
- {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
- {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
- {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
- {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
- {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
- {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
- {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
- {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
-};
-
-static const unsigned char __cpuinitdata mV_vrm85[32] = {
- 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
- 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
- 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
- 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
-};
-
-static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
- {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
- {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
- {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
- {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
- {975, 15}, {950, 14}, {925, 13}, {900, 12},
- {875, 11}, {850, 10}, {825, 9}, {800, 8},
- {775, 7}, {750, 6}, {725, 5}, {700, 4},
- {675, 3}, {650, 2}, {625, 1}, {600, 0}
-};
-
-static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
- 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
- 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
- 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
- 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
-};
-
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index d9f51367666..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "longrun", msg)
-
-static struct cpufreq_driver longrun_driver;
-
-/**
- * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
- * values into per cent values. In TMTA microcode, the following is valid:
- * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
- */
-static unsigned int longrun_low_freq, longrun_high_freq;
-
-
-/**
- * longrun_get_policy - get the current LongRun policy
- * @policy: struct cpufreq_policy where current policy is written into
- *
- * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
- * and MSR_TMTA_LONGRUN_CTRL
- */
-static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy)
-{
- u32 msr_lo, msr_hi;
-
- rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
- dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
- if (msr_lo & 0x01)
- policy->policy = CPUFREQ_POLICY_PERFORMANCE;
- else
- policy->policy = CPUFREQ_POLICY_POWERSAVE;
-
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
- msr_lo &= 0x0000007F;
- msr_hi &= 0x0000007F;
-
- if (longrun_high_freq <= longrun_low_freq) {
- /* Assume degenerate Longrun table */
- policy->min = policy->max = longrun_high_freq;
- } else {
- policy->min = longrun_low_freq + msr_lo *
- ((longrun_high_freq - longrun_low_freq) / 100);
- policy->max = longrun_low_freq + msr_hi *
- ((longrun_high_freq - longrun_low_freq) / 100);
- }
- policy->cpu = 0;
-}
-
-
-/**
- * longrun_set_policy - sets a new CPUFreq policy
- * @policy: new policy
- *
- * Sets a new CPUFreq policy on LongRun-capable processors. This function
- * has to be called with cpufreq_driver locked.
- */
-static int longrun_set_policy(struct cpufreq_policy *policy)
-{
- u32 msr_lo, msr_hi;
- u32 pctg_lo, pctg_hi;
-
- if (!policy)
- return -EINVAL;
-
- if (longrun_high_freq <= longrun_low_freq) {
- /* Assume degenerate Longrun table */
- pctg_lo = pctg_hi = 100;
- } else {
- pctg_lo = (policy->min - longrun_low_freq) /
- ((longrun_high_freq - longrun_low_freq) / 100);
- pctg_hi = (policy->max - longrun_low_freq) /
- ((longrun_high_freq - longrun_low_freq) / 100);
- }
-
- if (pctg_hi > 100)
- pctg_hi = 100;
- if (pctg_lo > pctg_hi)
- pctg_lo = pctg_hi;
-
- /* performance or economy mode */
- rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
- msr_lo &= 0xFFFFFFFE;
- switch (policy->policy) {
- case CPUFREQ_POLICY_PERFORMANCE:
- msr_lo |= 0x00000001;
- break;
- case CPUFREQ_POLICY_POWERSAVE:
- break;
- }
- wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
-
- /* lower and upper boundary */
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- msr_lo &= 0xFFFFFF80;
- msr_hi &= 0xFFFFFF80;
- msr_lo |= pctg_lo;
- msr_hi |= pctg_hi;
- wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
- return 0;
-}
-
-
-/**
- * longrun_verify_poliy - verifies a new CPUFreq policy
- * @policy: the policy to verify
- *
- * Validates a new CPUFreq policy. This function has to be called with
- * cpufreq_driver locked.
- */
-static int longrun_verify_policy(struct cpufreq_policy *policy)
-{
- if (!policy)
- return -EINVAL;
-
- policy->cpu = 0;
- cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
-
- if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
- (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
- return -EINVAL;
-
- return 0;
-}
-
-static unsigned int longrun_get(unsigned int cpu)
-{
- u32 eax, ebx, ecx, edx;
-
- if (cpu)
- return 0;
-
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
- dprintk("cpuid eax is %u\n", eax);
-
- return eax * 1000;
-}
-
-/**
- * longrun_determine_freqs - determines the lowest and highest possible core frequency
- * @low_freq: an int to put the lowest frequency into
- * @high_freq: an int to put the highest frequency into
- *
- * Determines the lowest and highest possible core frequencies on this CPU.
- * This is necessary to calculate the performance percentage according to
- * TMTA rules:
- * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
- */
-static int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
- unsigned int *high_freq)
-{
- u32 msr_lo, msr_hi;
- u32 save_lo, save_hi;
- u32 eax, ebx, ecx, edx;
- u32 try_hi;
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (!low_freq || !high_freq)
- return -EINVAL;
-
- if (cpu_has(c, X86_FEATURE_LRTI)) {
- /* if the LongRun Table Interface is present, the
- * detection is a bit easier:
- * For minimum frequency, read out the maximum
- * level (msr_hi), write that into "currently
- * selected level", and read out the frequency.
- * For maximum frequency, read out level zero.
- */
- /* minimum */
- rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
- wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
- rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
- *low_freq = msr_lo * 1000; /* to kHz */
-
- /* maximum */
- wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
- rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
- *high_freq = msr_lo * 1000; /* to kHz */
-
- dprintk("longrun table interface told %u - %u kHz\n",
- *low_freq, *high_freq);
-
- if (*low_freq > *high_freq)
- *low_freq = *high_freq;
- return 0;
- }
-
- /* set the upper border to the value determined during TSC init */
- *high_freq = (cpu_khz / 1000);
- *high_freq = *high_freq * 1000;
- dprintk("high frequency is %u kHz\n", *high_freq);
-
- /* get current borders */
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- save_lo = msr_lo & 0x0000007F;
- save_hi = msr_hi & 0x0000007F;
-
- /* if current perf_pctg is larger than 90%, we need to decrease the
- * upper limit to make the calculation more accurate.
- */
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
- /* try decreasing in 10% steps, some processors react only
- * on some barrier values */
- for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
- /* set to 0 to try_hi perf_pctg */
- msr_lo &= 0xFFFFFF80;
- msr_hi &= 0xFFFFFF80;
- msr_hi |= try_hi;
- wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
- /* read out current core MHz and current perf_pctg */
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
-
- /* restore values */
- wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
- }
- dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
-
- /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
- * eqals
- * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
- *
- * high_freq * perf_pctg is stored tempoarily into "ebx".
- */
- ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
-
- if ((ecx > 95) || (ecx == 0) || (eax < ebx))
- return -EIO;
-
- edx = ((eax - ebx) * 100) / (100 - ecx);
- *low_freq = edx * 1000; /* back to kHz */
-
- dprintk("low frequency is %u kHz\n", *low_freq);
-
- if (*low_freq > *high_freq)
- *low_freq = *high_freq;
-
- return 0;
-}
-
-
-static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
-{
- int result = 0;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* detect low and high frequency */
- result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
- if (result)
- return result;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.min_freq = longrun_low_freq;
- policy->cpuinfo.max_freq = longrun_high_freq;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- longrun_get_policy(policy);
-
- return 0;
-}
-
-
-static struct cpufreq_driver longrun_driver = {
- .flags = CPUFREQ_CONST_LOOPS,
- .verify = longrun_verify_policy,
- .setpolicy = longrun_set_policy,
- .get = longrun_get,
- .init = longrun_cpu_init,
- .name = "longrun",
- .owner = THIS_MODULE,
-};
-
-
-/**
- * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
- *
- * Initializes the LongRun support.
- */
-static int __init longrun_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
- !cpu_has(c, X86_FEATURE_LONGRUN))
- return -ENODEV;
-
- return cpufreq_register_driver(&longrun_driver);
-}
-
-
-/**
- * longrun_exit - unregisters LongRun support
- */
-static void __exit longrun_exit(void)
-{
- cpufreq_unregister_driver(&longrun_driver);
-}
-
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
- "Efficeon processors.");
-MODULE_LICENSE("GPL");
-
-module_init(longrun_init);
-module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
deleted file mode 100644
index 911e193018a..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-
-#include "mperf.h"
-
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
- struct aperfmperf *am = _cur;
-
- get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
- unsigned int cpu)
-{
- struct aperfmperf perf;
- unsigned long ratio;
- unsigned int retval;
-
- if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
- return 0;
-
- ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
- per_cpu(acfreq_old_perf, cpu) = perf;
-
- retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
deleted file mode 100644
index 5dbf2950dc2..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * (c) 2010 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- */
-
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
- unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index 52c93648e49..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
- * (C) 2002 Tora T. Engstad
- * All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * The author(s) of this software shall not be held liable for damages
- * of any nature resulting due to the use of this software. This
- * software is provided AS-IS with no warranties.
- *
- * Date Errata Description
- * 20020525 N44, O17 12.5% or 25% DC causes lockup
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/cpumask.h>
-#include <linux/timex.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/timer.h>
-
-#include "speedstep-lib.h"
-
-#define PFX "p4-clockmod: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "p4-clockmod", msg)
-
-/*
- * Duty Cycle (3bits), note DC_DISABLE is not specified in
- * intel docs i just use it to mean disable
- */
-enum {
- DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
- DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
-};
-
-#define DC_ENTRIES 8
-
-
-static int has_N44_O17_errata[NR_CPUS];
-static unsigned int stock_freq;
-static struct cpufreq_driver p4clockmod_driver;
-static unsigned int cpufreq_p4_get(unsigned int cpu);
-
-static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
-{
- u32 l, h;
-
- if (!cpu_online(cpu) ||
- (newstate > DC_DISABLE) || (newstate == DC_RESV))
- return -EINVAL;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
-
- if (l & 0x01)
- dprintk("CPU#%d currently thermal throttled\n", cpu);
-
- if (has_N44_O17_errata[cpu] &&
- (newstate == DC_25PT || newstate == DC_DFLT))
- newstate = DC_38PT;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
- if (newstate == DC_DISABLE) {
- dprintk("CPU#%d disabling modulation\n", cpu);
- wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
- } else {
- dprintk("CPU#%d setting duty cycle to %d%%\n",
- cpu, ((125 * newstate) / 10));
- /* bits 63 - 5 : reserved
- * bit 4 : enable/disable
- * bits 3-1 : duty cycle
- * bit 0 : reserved
- */
- l = (l & ~14);
- l = l | (1<<4) | ((newstate & 0x7)<<1);
- wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
- }
-
- return 0;
-}
-
-
-static struct cpufreq_frequency_table p4clockmod_table[] = {
- {DC_RESV, CPUFREQ_ENTRY_INVALID},
- {DC_DFLT, 0},
- {DC_25PT, 0},
- {DC_38PT, 0},
- {DC_50PT, 0},
- {DC_64PT, 0},
- {DC_75PT, 0},
- {DC_88PT, 0},
- {DC_DISABLE, 0},
- {DC_RESV, CPUFREQ_TABLE_END},
-};
-
-
-static int cpufreq_p4_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = DC_RESV;
- struct cpufreq_freqs freqs;
- int i;
-
- if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- freqs.old = cpufreq_p4_get(policy->cpu);
- freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
-
- if (freqs.new == freqs.old)
- return 0;
-
- /* notifiers */
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- /* run on each logical CPU,
- * see section 13.15.3 of IA32 Intel Architecture Software
- * Developer's Manual, Volume 3
- */
- for_each_cpu(i, policy->cpus)
- cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
-
- /* notifiers */
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- return 0;
-}
-
-
-static int cpufreq_p4_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
-}
-
-
-static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x06) {
- if (cpu_has(c, X86_FEATURE_EST))
- printk_once(KERN_WARNING PFX "Warning: EST-capable "
- "CPU detected. The acpi-cpufreq module offers "
- "voltage scaling in addition to frequency "
- "scaling. You should use that instead of "
- "p4-clockmod, if possible.\n");
- switch (c->x86_model) {
- case 0x0E: /* Core */
- case 0x0F: /* Core Duo */
- case 0x16: /* Celeron Core */
- case 0x1C: /* Atom */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
- return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
- case 0x0D: /* Pentium M (Dothan) */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
- /* fall through */
- case 0x09: /* Pentium M (Banias) */
- return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
- }
- }
-
- if (c->x86 != 0xF)
- return 0;
-
- /* on P-4s, the TSC runs with constant frequency independent whether
- * throttling is active or not. */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
- printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
- "The speedstep-ich or acpi cpufreq modules offer "
- "voltage scaling in addition of frequency scaling. "
- "You should use either one instead of p4-clockmod, "
- "if possible.\n");
- return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
- }
-
- return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
-}
-
-
-
-static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
- int cpuid = 0;
- unsigned int i;
-
-#ifdef CONFIG_SMP
- cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
-
- /* Errata workaround */
- cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
- switch (cpuid) {
- case 0x0f07:
- case 0x0f0a:
- case 0x0f11:
- case 0x0f12:
- has_N44_O17_errata[policy->cpu] = 1;
- dprintk("has errata -- disabling low frequencies\n");
- }
-
- if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
- c->x86_model < 2) {
- /* switch to maximum frequency and measure result */
- cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
- recalibrate_cpu_khz();
- }
- /* get max frequency */
- stock_freq = cpufreq_p4_get_frequency(c);
- if (!stock_freq)
- return -EINVAL;
-
- /* table init */
- for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
- if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
- p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
- else
- p4clockmod_table[i].frequency = (stock_freq * i)/8;
- }
- cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
-
- /* cpuinfo and default policy values */
-
- /* the transition latency is set to be 1 higher than the maximum
- * transition latency of the ondemand governor */
- policy->cpuinfo.transition_latency = 10000001;
- policy->cur = stock_freq;
-
- return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
-}
-
-
-static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int cpufreq_p4_get(unsigned int cpu)
-{
- u32 l, h;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
-
- if (l & 0x10) {
- l = l >> 1;
- l &= 0x7;
- } else
- l = DC_DISABLE;
-
- if (l != DC_DISABLE)
- return stock_freq * l / 8;
-
- return stock_freq;
-}
-
-static struct freq_attr *p4clockmod_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver p4clockmod_driver = {
- .verify = cpufreq_p4_verify,
- .target = cpufreq_p4_target,
- .init = cpufreq_p4_cpu_init,
- .exit = cpufreq_p4_cpu_exit,
- .get = cpufreq_p4_get,
- .name = "p4-clockmod",
- .owner = THIS_MODULE,
- .attr = p4clockmod_attr,
-};
-
-
-static int __init cpufreq_p4_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int ret;
-
- /*
- * THERM_CONTROL is architectural for IA32 now, so
- * we can rely on the capability checks
- */
- if (c->x86_vendor != X86_VENDOR_INTEL)
- return -ENODEV;
-
- if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
- !test_cpu_cap(c, X86_FEATURE_ACC))
- return -ENODEV;
-
- ret = cpufreq_register_driver(&p4clockmod_driver);
- if (!ret)
- printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
- "Modulation available\n");
-
- return ret;
-}
-
-
-static void __exit cpufreq_p4_exit(void)
-{
- cpufreq_unregister_driver(&p4clockmod_driver);
-}
-
-
-MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
-MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
-MODULE_LICENSE("GPL");
-
-late_initcall(cpufreq_p4_init);
-module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
deleted file mode 100644
index 755a31e0f5b..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ /dev/null
@@ -1,624 +0,0 @@
-/*
- * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
- *
- * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
- * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
- * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
- * INFRINGEMENT. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/cpufreq.h>
-#include <linux/compiler.h>
-#include <linux/slab.h>
-
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/spinlock.h>
-#include <linux/uaccess.h>
-
-#include <acpi/processor.h>
-
-#define PCC_VERSION "1.00.00"
-#define POLL_LOOPS 300
-
-#define CMD_COMPLETE 0x1
-#define CMD_GET_FREQ 0x0
-#define CMD_SET_FREQ 0x1
-
-#define BUF_SZ 4
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "pcc-cpufreq", msg)
-
-struct pcc_register_resource {
- u8 descriptor;
- u16 length;
- u8 space_id;
- u8 bit_width;
- u8 bit_offset;
- u8 access_size;
- u64 address;
-} __attribute__ ((packed));
-
-struct pcc_memory_resource {
- u8 descriptor;
- u16 length;
- u8 space_id;
- u8 resource_usage;
- u8 type_specific;
- u64 granularity;
- u64 minimum;
- u64 maximum;
- u64 translation_offset;
- u64 address_length;
-} __attribute__ ((packed));
-
-static struct cpufreq_driver pcc_cpufreq_driver;
-
-struct pcc_header {
- u32 signature;
- u16 length;
- u8 major;
- u8 minor;
- u32 features;
- u16 command;
- u16 status;
- u32 latency;
- u32 minimum_time;
- u32 maximum_time;
- u32 nominal;
- u32 throttled_frequency;
- u32 minimum_frequency;
-};
-
-static void __iomem *pcch_virt_addr;
-static struct pcc_header __iomem *pcch_hdr;
-
-static DEFINE_SPINLOCK(pcc_lock);
-
-static struct acpi_generic_address doorbell;
-
-static u64 doorbell_preserve;
-static u64 doorbell_write;
-
-static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
- 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
-
-struct pcc_cpu {
- u32 input_offset;
- u32 output_offset;
-};
-
-static struct pcc_cpu __percpu *pcc_cpu_info;
-
-static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
-{
- cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
- return 0;
-}
-
-static inline void pcc_cmd(void)
-{
- u64 doorbell_value;
- int i;
-
- acpi_read(&doorbell_value, &doorbell);
- acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
- &doorbell);
-
- for (i = 0; i < POLL_LOOPS; i++) {
- if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
- break;
- }
-}
-
-static inline void pcc_clear_mapping(void)
-{
- if (pcch_virt_addr)
- iounmap(pcch_virt_addr);
- pcch_virt_addr = NULL;
-}
-
-static unsigned int pcc_get_freq(unsigned int cpu)
-{
- struct pcc_cpu *pcc_cpu_data;
- unsigned int curr_freq;
- unsigned int freq_limit;
- u16 status;
- u32 input_buffer;
- u32 output_buffer;
-
- spin_lock(&pcc_lock);
-
- dprintk("get: get_freq for CPU %d\n", cpu);
- pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
- input_buffer = 0x1;
- iowrite32(input_buffer,
- (pcch_virt_addr + pcc_cpu_data->input_offset));
- iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
-
- pcc_cmd();
-
- output_buffer =
- ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
-
- /* Clear the input buffer - we are done with the current command */
- memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
-
- status = ioread16(&pcch_hdr->status);
- if (status != CMD_COMPLETE) {
- dprintk("get: FAILED: for CPU %d, status is %d\n",
- cpu, status);
- goto cmd_incomplete;
- }
- iowrite16(0, &pcch_hdr->status);
- curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
- / 100) * 1000);
-
- dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
- "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
- cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
- output_buffer, curr_freq);
-
- freq_limit = (output_buffer >> 8) & 0xff;
- if (freq_limit != 0xff) {
- dprintk("get: frequency for cpu %d is being temporarily"
- " capped at %d\n", cpu, curr_freq);
- }
-
- spin_unlock(&pcc_lock);
- return curr_freq;
-
-cmd_incomplete:
- iowrite16(0, &pcch_hdr->status);
- spin_unlock(&pcc_lock);
- return 0;
-}
-
-static int pcc_cpufreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- struct pcc_cpu *pcc_cpu_data;
- struct cpufreq_freqs freqs;
- u16 status;
- u32 input_buffer;
- int cpu;
-
- spin_lock(&pcc_lock);
- cpu = policy->cpu;
- pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
- dprintk("target: CPU %d should go to target freq: %d "
- "(virtual) input_offset is 0x%x\n",
- cpu, target_freq,
- (pcch_virt_addr + pcc_cpu_data->input_offset));
-
- freqs.new = target_freq;
- freqs.cpu = cpu;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- input_buffer = 0x1 | (((target_freq * 100)
- / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
- iowrite32(input_buffer,
- (pcch_virt_addr + pcc_cpu_data->input_offset));
- iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
-
- pcc_cmd();
-
- /* Clear the input buffer - we are done with the current command */
- memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
-
- status = ioread16(&pcch_hdr->status);
- if (status != CMD_COMPLETE) {
- dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
- cpu, status);
- goto cmd_incomplete;
- }
- iowrite16(0, &pcch_hdr->status);
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
- spin_unlock(&pcc_lock);
-
- return 0;
-
-cmd_incomplete:
- iowrite16(0, &pcch_hdr->status);
- spin_unlock(&pcc_lock);
- return -EINVAL;
-}
-
-static int pcc_get_offset(int cpu)
-{
- acpi_status status;
- struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
- union acpi_object *pccp, *offset;
- struct pcc_cpu *pcc_cpu_data;
- struct acpi_processor *pr;
- int ret = 0;
-
- pr = per_cpu(processors, cpu);
- pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
- status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- pccp = buffer.pointer;
- if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
- ret = -ENODEV;
- goto out_free;
- };
-
- offset = &(pccp->package.elements[0]);
- if (!offset || offset->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- pcc_cpu_data->input_offset = offset->integer.value;
-
- offset = &(pccp->package.elements[1]);
- if (!offset || offset->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- pcc_cpu_data->output_offset = offset->integer.value;
-
- memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
- memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
-
- dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
- "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
- cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
-out_free:
- kfree(buffer.pointer);
- return ret;
-}
-
-static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
-{
- acpi_status status;
- struct acpi_object_list input;
- struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
- union acpi_object in_params[4];
- union acpi_object *out_obj;
- u32 capabilities[2];
- u32 errors;
- u32 supported;
- int ret = 0;
-
- input.count = 4;
- input.pointer = in_params;
- in_params[0].type = ACPI_TYPE_BUFFER;
- in_params[0].buffer.length = 16;
- in_params[0].buffer.pointer = OSC_UUID;
- in_params[1].type = ACPI_TYPE_INTEGER;
- in_params[1].integer.value = 1;
- in_params[2].type = ACPI_TYPE_INTEGER;
- in_params[2].integer.value = 2;
- in_params[3].type = ACPI_TYPE_BUFFER;
- in_params[3].buffer.length = 8;
- in_params[3].buffer.pointer = (u8 *)&capabilities;
-
- capabilities[0] = OSC_QUERY_ENABLE;
- capabilities[1] = 0x1;
-
- status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- if (!output.length)
- return -ENODEV;
-
- out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
- if (errors) {
- ret = -ENODEV;
- goto out_free;
- }
-
- supported = *((u32 *)(out_obj->buffer.pointer + 4));
- if (!(supported & 0x1)) {
- ret = -ENODEV;
- goto out_free;
- }
-
- kfree(output.pointer);
- capabilities[0] = 0x0;
- capabilities[1] = 0x1;
-
- status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- if (!output.length)
- return -ENODEV;
-
- out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
- if (errors) {
- ret = -ENODEV;
- goto out_free;
- }
-
- supported = *((u32 *)(out_obj->buffer.pointer + 4));
- if (!(supported & 0x1)) {
- ret = -ENODEV;
- goto out_free;
- }
-
-out_free:
- kfree(output.pointer);
- return ret;
-}
-
-static int __init pcc_cpufreq_probe(void)
-{
- acpi_status status;
- struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
- struct pcc_memory_resource *mem_resource;
- struct pcc_register_resource *reg_resource;
- union acpi_object *out_obj, *member;
- acpi_handle handle, osc_handle, pcch_handle;
- int ret = 0;
-
- status = acpi_get_handle(NULL, "\\_SB", &handle);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- status = acpi_get_handle(handle, "PCCH", &pcch_handle);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- status = acpi_get_handle(handle, "_OSC", &osc_handle);
- if (ACPI_SUCCESS(status)) {
- ret = pcc_cpufreq_do_osc(&osc_handle);
- if (ret)
- dprintk("probe: _OSC evaluation did not succeed\n");
- /* Firmware's use of _OSC is optional */
- ret = 0;
- }
-
- status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_PACKAGE) {
- ret = -ENODEV;
- goto out_free;
- }
-
- member = &out_obj->package.elements[0];
- if (member->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
-
- dprintk("probe: mem_resource descriptor: 0x%x,"
- " length: %d, space_id: %d, resource_usage: %d,"
- " type_specific: %d, granularity: 0x%llx,"
- " minimum: 0x%llx, maximum: 0x%llx,"
- " translation_offset: 0x%llx, address_length: 0x%llx\n",
- mem_resource->descriptor, mem_resource->length,
- mem_resource->space_id, mem_resource->resource_usage,
- mem_resource->type_specific, mem_resource->granularity,
- mem_resource->minimum, mem_resource->maximum,
- mem_resource->translation_offset,
- mem_resource->address_length);
-
- if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
- ret = -ENODEV;
- goto out_free;
- }
-
- pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
- mem_resource->address_length);
- if (pcch_virt_addr == NULL) {
- dprintk("probe: could not map shared mem region\n");
- goto out_free;
- }
- pcch_hdr = pcch_virt_addr;
-
- dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
- dprintk("probe: PCCH header is at physical address: 0x%llx,"
- " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
- " supported features: 0x%x, command field: 0x%x,"
- " status field: 0x%x, nominal latency: %d us\n",
- mem_resource->minimum, ioread32(&pcch_hdr->signature),
- ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
- ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
- ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
- ioread32(&pcch_hdr->latency));
-
- dprintk("probe: min time between commands: %d us,"
- " max time between commands: %d us,"
- " nominal CPU frequency: %d MHz,"
- " minimum CPU frequency: %d MHz,"
- " minimum CPU frequency without throttling: %d MHz\n",
- ioread32(&pcch_hdr->minimum_time),
- ioread32(&pcch_hdr->maximum_time),
- ioread32(&pcch_hdr->nominal),
- ioread32(&pcch_hdr->throttled_frequency),
- ioread32(&pcch_hdr->minimum_frequency));
-
- member = &out_obj->package.elements[1];
- if (member->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto pcch_free;
- }
-
- reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
-
- doorbell.space_id = reg_resource->space_id;
- doorbell.bit_width = reg_resource->bit_width;
- doorbell.bit_offset = reg_resource->bit_offset;
- doorbell.access_width = 64;
- doorbell.address = reg_resource->address;
-
- dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
- "bit_offset is %d, access_width is %d, address is 0x%llx\n",
- doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
- doorbell.access_width, reg_resource->address);
-
- member = &out_obj->package.elements[2];
- if (member->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto pcch_free;
- }
-
- doorbell_preserve = member->integer.value;
-
- member = &out_obj->package.elements[3];
- if (member->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto pcch_free;
- }
-
- doorbell_write = member->integer.value;
-
- dprintk("probe: doorbell_preserve: 0x%llx,"
- " doorbell_write: 0x%llx\n",
- doorbell_preserve, doorbell_write);
-
- pcc_cpu_info = alloc_percpu(struct pcc_cpu);
- if (!pcc_cpu_info) {
- ret = -ENOMEM;
- goto pcch_free;
- }
-
- printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
- " limits: %d MHz, %d MHz\n", PCC_VERSION,
- ioread32(&pcch_hdr->minimum_frequency),
- ioread32(&pcch_hdr->nominal));
- kfree(output.pointer);
- return ret;
-pcch_free:
- pcc_clear_mapping();
-out_free:
- kfree(output.pointer);
- return ret;
-}
-
-static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
- unsigned int result = 0;
-
- if (!pcch_virt_addr) {
- result = -1;
- goto out;
- }
-
- result = pcc_get_offset(cpu);
- if (result) {
- dprintk("init: PCCP evaluation failed\n");
- goto out;
- }
-
- policy->max = policy->cpuinfo.max_freq =
- ioread32(&pcch_hdr->nominal) * 1000;
- policy->min = policy->cpuinfo.min_freq =
- ioread32(&pcch_hdr->minimum_frequency) * 1000;
- policy->cur = pcc_get_freq(cpu);
-
- if (!policy->cur) {
- dprintk("init: Unable to get current CPU frequency\n");
- result = -EINVAL;
- goto out;
- }
-
- dprintk("init: policy->max is %d, policy->min is %d\n",
- policy->max, policy->min);
-out:
- return result;
-}
-
-static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
- return 0;
-}
-
-static struct cpufreq_driver pcc_cpufreq_driver = {
- .flags = CPUFREQ_CONST_LOOPS,
- .get = pcc_get_freq,
- .verify = pcc_cpufreq_verify,
- .target = pcc_cpufreq_target,
- .init = pcc_cpufreq_cpu_init,
- .exit = pcc_cpufreq_cpu_exit,
- .name = "pcc-cpufreq",
- .owner = THIS_MODULE,
-};
-
-static int __init pcc_cpufreq_init(void)
-{
- int ret;
-
- if (acpi_disabled)
- return 0;
-
- ret = pcc_cpufreq_probe();
- if (ret) {
- dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
- return ret;
- }
-
- ret = cpufreq_register_driver(&pcc_cpufreq_driver);
-
- return ret;
-}
-
-static void __exit pcc_cpufreq_exit(void)
-{
- cpufreq_unregister_driver(&pcc_cpufreq_driver);
-
- pcc_clear_mapping();
-
- free_percpu(pcc_cpu_info);
-}
-
-MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
-MODULE_VERSION(PCC_VERSION);
-MODULE_DESCRIPTION("Processor Clocking Control interface driver");
-MODULE_LICENSE("GPL");
-
-late_initcall(pcc_cpufreq_init);
-module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index b3379d6a5c5..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
- * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
- * Dominik Brodowski.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
- as it is unused */
-
-#define PFX "powernow-k6: "
-static unsigned int busfreq; /* FSB, in 10 kHz */
-static unsigned int max_multiplier;
-
-
-/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
-static struct cpufreq_frequency_table clock_ratio[] = {
- {45, /* 000 -> 4.5x */ 0},
- {50, /* 001 -> 5.0x */ 0},
- {40, /* 010 -> 4.0x */ 0},
- {55, /* 011 -> 5.5x */ 0},
- {20, /* 100 -> 2.0x */ 0},
- {30, /* 101 -> 3.0x */ 0},
- {60, /* 110 -> 6.0x */ 0},
- {35, /* 111 -> 3.5x */ 0},
- {0, CPUFREQ_TABLE_END}
-};
-
-
-/**
- * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
- *
- * Returns the current setting of the frequency multiplier. Core clock
- * speed is frequency of the Front-Side Bus multiplied with this value.
- */
-static int powernow_k6_get_cpu_multiplier(void)
-{
- u64 invalue = 0;
- u32 msrval;
-
- msrval = POWERNOW_IOPORT + 0x1;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue = inl(POWERNOW_IOPORT + 0x8);
- msrval = POWERNOW_IOPORT + 0x0;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
- return clock_ratio[(invalue >> 5)&7].index;
-}
-
-
-/**
- * powernow_k6_set_state - set the PowerNow! multiplier
- * @best_i: clock_ratio[best_i] is the target multiplier
- *
- * Tries to change the PowerNow! multiplier
- */
-static void powernow_k6_set_state(unsigned int best_i)
-{
- unsigned long outvalue = 0, invalue = 0;
- unsigned long msrval;
- struct cpufreq_freqs freqs;
-
- if (clock_ratio[best_i].index > max_multiplier) {
- printk(KERN_ERR PFX "invalid target frequency\n");
- return;
- }
-
- freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
- freqs.new = busfreq * clock_ratio[best_i].index;
- freqs.cpu = 0; /* powernow-k6.c is UP only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* we now need to transform best_i to the BVC format, see AMD#23446 */
-
- outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
-
- msrval = POWERNOW_IOPORT + 0x1;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue = inl(POWERNOW_IOPORT + 0x8);
- invalue = invalue & 0xf;
- outvalue = outvalue | invalue;
- outl(outvalue , (POWERNOW_IOPORT + 0x8));
- msrval = POWERNOW_IOPORT + 0x0;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return;
-}
-
-
-/**
- * powernow_k6_verify - verifies a new CPUfreq policy
- * @policy: new policy
- *
- * Policy must be within lowest and highest possible CPU Frequency,
- * and at least one possible state must be within min and max.
- */
-static int powernow_k6_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
-}
-
-
-/**
- * powernow_k6_setpolicy - sets a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * sets a new CPUFreq policy
- */
-static int powernow_k6_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- powernow_k6_set_state(newstate);
-
- return 0;
-}
-
-
-static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i, f;
- int result;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* get frequencies */
- max_multiplier = powernow_k6_get_cpu_multiplier();
- busfreq = cpu_khz / max_multiplier;
-
- /* table init */
- for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
- f = clock_ratio[i].index;
- if (f > max_multiplier)
- clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
- else
- clock_ratio[i].frequency = busfreq * f;
- }
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = 200000;
- policy->cur = busfreq * max_multiplier;
-
- result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
-
- return 0;
-}
-
-
-static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int i;
- for (i = 0; i < 8; i++) {
- if (i == max_multiplier)
- powernow_k6_set_state(i);
- }
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int powernow_k6_get(unsigned int cpu)
-{
- unsigned int ret;
- ret = (busfreq * powernow_k6_get_cpu_multiplier());
- return ret;
-}
-
-static struct freq_attr *powernow_k6_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver powernow_k6_driver = {
- .verify = powernow_k6_verify,
- .target = powernow_k6_target,
- .init = powernow_k6_cpu_init,
- .exit = powernow_k6_cpu_exit,
- .get = powernow_k6_get,
- .name = "powernow-k6",
- .owner = THIS_MODULE,
- .attr = powernow_k6_attr,
-};
-
-
-/**
- * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
- *
- * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
- * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero
- * on success.
- */
-static int __init powernow_k6_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
- ((c->x86_model != 12) && (c->x86_model != 13)))
- return -ENODEV;
-
- if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
- printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
- return -EIO;
- }
-
- if (cpufreq_register_driver(&powernow_k6_driver)) {
- release_region(POWERNOW_IOPORT, 16);
- return -EINVAL;
- }
-
- return 0;
-}
-
-
-/**
- * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
- *
- * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
- */
-static void __exit powernow_k6_exit(void)
-{
- cpufreq_unregister_driver(&powernow_k6_driver);
- release_region(POWERNOW_IOPORT, 16);
-}
-
-
-MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
- "Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
-MODULE_LICENSE("GPL");
-
-module_init(powernow_k6_init);
-module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index 4a45fd6e41b..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,752 +0,0 @@
-/*
- * AMD K7 Powernow driver.
- * (C) 2003 Dave Jones on behalf of SuSE Labs.
- * (C) 2003-2004 Dave Jones <davej@redhat.com>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- * Errata 5:
- * CPU may fail to execute a FID/VID change in presence of interrupt.
- * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
- * Errata 15:
- * CPU with half frequency multipliers may hang upon wakeup from disconnect.
- * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/dmi.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
-#include <asm/msr.h>
-#include <asm/system.h>
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-#include <linux/acpi.h>
-#include <acpi/processor.h>
-#endif
-
-#include "powernow-k7.h"
-
-#define PFX "powernow: "
-
-
-struct psb_s {
- u8 signature[10];
- u8 tableversion;
- u8 flags;
- u16 settlingtime;
- u8 reserved1;
- u8 numpst;
-};
-
-struct pst_s {
- u32 cpuid;
- u8 fsbspeed;
- u8 maxfid;
- u8 startvid;
- u8 numpstates;
-};
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-union powernow_acpi_control_t {
- struct {
- unsigned long fid:5,
- vid:5,
- sgtc:20,
- res1:2;
- } bits;
- unsigned long val;
-};
-#endif
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-/* divide by 1000 to get VCore voltage in V. */
-static const int mobile_vid_table[32] = {
- 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
- 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
- 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
- 1075, 1050, 1025, 1000, 975, 950, 925, 0,
-};
-#endif
-
-/* divide by 10 to get FID. */
-static const int fid_codes[32] = {
- 110, 115, 120, 125, 50, 55, 60, 65,
- 70, 75, 80, 85, 90, 95, 100, 105,
- 30, 190, 40, 200, 130, 135, 140, 210,
- 150, 225, 160, 165, 170, 180, -1, -1,
-};
-
-/* This parameter is used in order to force ACPI instead of legacy method for
- * configuration purpose.
- */
-
-static int acpi_force;
-
-static struct cpufreq_frequency_table *powernow_table;
-
-static unsigned int can_scale_bus;
-static unsigned int can_scale_vid;
-static unsigned int minimum_speed = -1;
-static unsigned int maximum_speed;
-static unsigned int number_scales;
-static unsigned int fsb;
-static unsigned int latency;
-static char have_a0;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "powernow-k7", msg)
-
-static int check_fsb(unsigned int fsbspeed)
-{
- int delta;
- unsigned int f = fsb / 1000;
-
- delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
- return delta < 5;
-}
-
-static int check_powernow(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- unsigned int maxei, eax, ebx, ecx, edx;
-
- if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
-#ifdef MODULE
- printk(KERN_INFO PFX "This module only works with "
- "AMD K7 CPUs\n");
-#endif
- return 0;
- }
-
- /* Get maximum capabilities */
- maxei = cpuid_eax(0x80000000);
- if (maxei < 0x80000007) { /* Any powernow info ? */
-#ifdef MODULE
- printk(KERN_INFO PFX "No powernow capabilities detected\n");
-#endif
- return 0;
- }
-
- if ((c->x86_model == 6) && (c->x86_mask == 0)) {
- printk(KERN_INFO PFX "K7 660[A0] core detected, "
- "enabling errata workarounds\n");
- have_a0 = 1;
- }
-
- cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
-
- /* Check we can actually do something before we say anything.*/
- if (!(edx & (1 << 1 | 1 << 2)))
- return 0;
-
- printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
-
- if (edx & 1 << 1) {
- printk("frequency");
- can_scale_bus = 1;
- }
-
- if ((edx & (1 << 1 | 1 << 2)) == 0x6)
- printk(" and ");
-
- if (edx & 1 << 2) {
- printk("voltage");
- can_scale_vid = 1;
- }
-
- printk(".\n");
- return 1;
-}
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-static void invalidate_entry(unsigned int entry)
-{
- powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-#endif
-
-static int get_ranges(unsigned char *pst)
-{
- unsigned int j;
- unsigned int speed;
- u8 fid, vid;
-
- powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
- (number_scales + 1)), GFP_KERNEL);
- if (!powernow_table)
- return -ENOMEM;
-
- for (j = 0 ; j < number_scales; j++) {
- fid = *pst++;
-
- powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
- powernow_table[j].index = fid; /* lower 8 bits */
-
- speed = powernow_table[j].frequency;
-
- if ((fid_codes[fid] % 10) == 5) {
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- if (have_a0 == 1)
- invalidate_entry(j);
-#endif
- }
-
- if (speed < minimum_speed)
- minimum_speed = speed;
- if (speed > maximum_speed)
- maximum_speed = speed;
-
- vid = *pst++;
- powernow_table[j].index |= (vid << 8); /* upper 8 bits */
-
- dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
- "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
- fid_codes[fid] % 10, speed/1000, vid,
- mobile_vid_table[vid]/1000,
- mobile_vid_table[vid]%1000);
- }
- powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
- powernow_table[number_scales].index = 0;
-
- return 0;
-}
-
-
-static void change_FID(int fid)
-{
- union msr_fidvidctl fidvidctl;
-
- rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- if (fidvidctl.bits.FID != fid) {
- fidvidctl.bits.SGTC = latency;
- fidvidctl.bits.FID = fid;
- fidvidctl.bits.VIDC = 0;
- fidvidctl.bits.FIDC = 1;
- wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- }
-}
-
-
-static void change_VID(int vid)
-{
- union msr_fidvidctl fidvidctl;
-
- rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- if (fidvidctl.bits.VID != vid) {
- fidvidctl.bits.SGTC = latency;
- fidvidctl.bits.VID = vid;
- fidvidctl.bits.FIDC = 0;
- fidvidctl.bits.VIDC = 1;
- wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- }
-}
-
-
-static void change_speed(unsigned int index)
-{
- u8 fid, vid;
- struct cpufreq_freqs freqs;
- union msr_fidvidstatus fidvidstatus;
- int cfid;
-
- /* fid are the lower 8 bits of the index we stored into
- * the cpufreq frequency table in powernow_decode_bios,
- * vid are the upper 8 bits.
- */
-
- fid = powernow_table[index].index & 0xFF;
- vid = (powernow_table[index].index & 0xFF00) >> 8;
-
- freqs.cpu = 0;
-
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
- cfid = fidvidstatus.bits.CFID;
- freqs.old = fsb * fid_codes[cfid] / 10;
-
- freqs.new = powernow_table[index].frequency;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Now do the magic poking into the MSRs. */
-
- if (have_a0 == 1) /* A0 errata 5 */
- local_irq_disable();
-
- if (freqs.old > freqs.new) {
- /* Going down, so change FID first */
- change_FID(fid);
- change_VID(vid);
- } else {
- /* Going up, so change VID first */
- change_VID(vid);
- change_FID(fid);
- }
-
-
- if (have_a0 == 1)
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-}
-
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-
-static struct acpi_processor_performance *acpi_processor_perf;
-
-static int powernow_acpi_init(void)
-{
- int i;
- int retval = 0;
- union powernow_acpi_control_t pc;
-
- if (acpi_processor_perf != NULL && powernow_table != NULL) {
- retval = -EINVAL;
- goto err0;
- }
-
- acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
- GFP_KERNEL);
- if (!acpi_processor_perf) {
- retval = -ENOMEM;
- goto err0;
- }
-
- if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
- GFP_KERNEL)) {
- retval = -ENOMEM;
- goto err05;
- }
-
- if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
- retval = -EIO;
- goto err1;
- }
-
- if (acpi_processor_perf->control_register.space_id !=
- ACPI_ADR_SPACE_FIXED_HARDWARE) {
- retval = -ENODEV;
- goto err2;
- }
-
- if (acpi_processor_perf->status_register.space_id !=
- ACPI_ADR_SPACE_FIXED_HARDWARE) {
- retval = -ENODEV;
- goto err2;
- }
-
- number_scales = acpi_processor_perf->state_count;
-
- if (number_scales < 2) {
- retval = -ENODEV;
- goto err2;
- }
-
- powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
- (number_scales + 1)), GFP_KERNEL);
- if (!powernow_table) {
- retval = -ENOMEM;
- goto err2;
- }
-
- pc.val = (unsigned long) acpi_processor_perf->states[0].control;
- for (i = 0; i < number_scales; i++) {
- u8 fid, vid;
- struct acpi_processor_px *state =
- &acpi_processor_perf->states[i];
- unsigned int speed, speed_mhz;
-
- pc.val = (unsigned long) state->control;
- dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
- i,
- (u32) state->core_frequency,
- (u32) state->power,
- (u32) state->transition_latency,
- (u32) state->control,
- pc.bits.sgtc);
-
- vid = pc.bits.vid;
- fid = pc.bits.fid;
-
- powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
- powernow_table[i].index = fid; /* lower 8 bits */
- powernow_table[i].index |= (vid << 8); /* upper 8 bits */
-
- speed = powernow_table[i].frequency;
- speed_mhz = speed / 1000;
-
- /* processor_perflib will multiply the MHz value by 1000 to
- * get a KHz value (e.g. 1266000). However, powernow-k7 works
- * with true KHz values (e.g. 1266768). To ensure that all
- * powernow frequencies are available, we must ensure that
- * ACPI doesn't restrict them, so we round up the MHz value
- * to ensure that perflib's computed KHz value is greater than
- * or equal to powernow's KHz value.
- */
- if (speed % 1000 > 0)
- speed_mhz++;
-
- if ((fid_codes[fid] % 10) == 5) {
- if (have_a0 == 1)
- invalidate_entry(i);
- }
-
- dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
- "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
- fid_codes[fid] % 10, speed_mhz, vid,
- mobile_vid_table[vid]/1000,
- mobile_vid_table[vid]%1000);
-
- if (state->core_frequency != speed_mhz) {
- state->core_frequency = speed_mhz;
- dprintk(" Corrected ACPI frequency to %d\n",
- speed_mhz);
- }
-
- if (latency < pc.bits.sgtc)
- latency = pc.bits.sgtc;
-
- if (speed < minimum_speed)
- minimum_speed = speed;
- if (speed > maximum_speed)
- maximum_speed = speed;
- }
-
- powernow_table[i].frequency = CPUFREQ_TABLE_END;
- powernow_table[i].index = 0;
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- return 0;
-
-err2:
- acpi_processor_unregister_performance(acpi_processor_perf, 0);
-err1:
- free_cpumask_var(acpi_processor_perf->shared_cpu_map);
-err05:
- kfree(acpi_processor_perf);
-err0:
- printk(KERN_WARNING PFX "ACPI perflib can not be used on "
- "this platform\n");
- acpi_processor_perf = NULL;
- return retval;
-}
-#else
-static int powernow_acpi_init(void)
-{
- printk(KERN_INFO PFX "no support for ACPI processor found."
- " Please recompile your kernel with ACPI processor\n");
- return -EINVAL;
-}
-#endif
-
-static void print_pst_entry(struct pst_s *pst, unsigned int j)
-{
- dprintk("PST:%d (@%p)\n", j, pst);
- dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
- pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
-}
-
-static int powernow_decode_bios(int maxfid, int startvid)
-{
- struct psb_s *psb;
- struct pst_s *pst;
- unsigned int i, j;
- unsigned char *p;
- unsigned int etuple;
- unsigned int ret;
-
- etuple = cpuid_eax(0x80000001);
-
- for (i = 0xC0000; i < 0xffff0 ; i += 16) {
-
- p = phys_to_virt(i);
-
- if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
- dprintk("Found PSB header at %p\n", p);
- psb = (struct psb_s *) p;
- dprintk("Table version: 0x%x\n", psb->tableversion);
- if (psb->tableversion != 0x12) {
- printk(KERN_INFO PFX "Sorry, only v1.2 tables"
- " supported right now\n");
- return -ENODEV;
- }
-
- dprintk("Flags: 0x%x\n", psb->flags);
- if ((psb->flags & 1) == 0)
- dprintk("Mobile voltage regulator\n");
- else
- dprintk("Desktop voltage regulator\n");
-
- latency = psb->settlingtime;
- if (latency < 100) {
- printk(KERN_INFO PFX "BIOS set settling time "
- "to %d microseconds. "
- "Should be at least 100. "
- "Correcting.\n", latency);
- latency = 100;
- }
- dprintk("Settling Time: %d microseconds.\n",
- psb->settlingtime);
- dprintk("Has %d PST tables. (Only dumping ones "
- "relevant to this CPU).\n",
- psb->numpst);
-
- p += sizeof(struct psb_s);
-
- pst = (struct pst_s *) p;
-
- for (j = 0; j < psb->numpst; j++) {
- pst = (struct pst_s *) p;
- number_scales = pst->numpstates;
-
- if ((etuple == pst->cpuid) &&
- check_fsb(pst->fsbspeed) &&
- (maxfid == pst->maxfid) &&
- (startvid == pst->startvid)) {
- print_pst_entry(pst, j);
- p = (char *)pst + sizeof(struct pst_s);
- ret = get_ranges(p);
- return ret;
- } else {
- unsigned int k;
- p = (char *)pst + sizeof(struct pst_s);
- for (k = 0; k < number_scales; k++)
- p += 2;
- }
- }
- printk(KERN_INFO PFX "No PST tables match this cpuid "
- "(0x%x)\n", etuple);
- printk(KERN_INFO PFX "This is indicative of a broken "
- "BIOS.\n");
-
- return -EINVAL;
- }
- p++;
- }
-
- return -ENODEV;
-}
-
-
-static int powernow_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate;
-
- if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
- relation, &newstate))
- return -EINVAL;
-
- change_speed(newstate);
-
- return 0;
-}
-
-
-static int powernow_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, powernow_table);
-}
-
-/*
- * We use the fact that the bus frequency is somehow
- * a multiple of 100000/3 khz, then we compute sgtc according
- * to this multiple.
- * That way, we match more how AMD thinks all of that work.
- * We will then get the same kind of behaviour already tested under
- * the "well-known" other OS.
- */
-static int __cpuinit fixup_sgtc(void)
-{
- unsigned int sgtc;
- unsigned int m;
-
- m = fsb / 3333;
- if ((m % 10) >= 5)
- m += 5;
-
- m /= 10;
-
- sgtc = 100 * m * latency;
- sgtc = sgtc / 3;
- if (sgtc > 0xfffff) {
- printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
- sgtc = 0xfffff;
- }
- return sgtc;
-}
-
-static unsigned int powernow_get(unsigned int cpu)
-{
- union msr_fidvidstatus fidvidstatus;
- unsigned int cfid;
-
- if (cpu)
- return 0;
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
- cfid = fidvidstatus.bits.CFID;
-
- return fsb * fid_codes[cfid] / 10;
-}
-
-
-static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
-{
- printk(KERN_WARNING PFX
- "%s laptop with broken PST tables in BIOS detected.\n",
- d->ident);
- printk(KERN_WARNING PFX
- "You need to downgrade to 3A21 (09/09/2002), or try a newer "
- "BIOS than 3A71 (01/20/2003)\n");
- printk(KERN_WARNING PFX
- "cpufreq scaling has been disabled as a result of this.\n");
- return 0;
-}
-
-/*
- * Some Athlon laptops have really fucked PST tables.
- * A BIOS update is all that can save them.
- * Mention this, and disable cpufreq.
- */
-static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
- {
- .callback = acer_cpufreq_pst,
- .ident = "Acer Aspire",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
- DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
- },
- },
- { }
-};
-
-static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
-{
- union msr_fidvidstatus fidvidstatus;
- int result;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
-
- recalibrate_cpu_khz();
-
- fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
- if (!fsb) {
- printk(KERN_WARNING PFX "can not determine bus frequency\n");
- return -EINVAL;
- }
- dprintk("FSB: %3dMHz\n", fsb/1000);
-
- if (dmi_check_system(powernow_dmi_table) || acpi_force) {
- printk(KERN_INFO PFX "PSB/PST known to be broken. "
- "Trying ACPI instead\n");
- result = powernow_acpi_init();
- } else {
- result = powernow_decode_bios(fidvidstatus.bits.MFID,
- fidvidstatus.bits.SVID);
- if (result) {
- printk(KERN_INFO PFX "Trying ACPI perflib\n");
- maximum_speed = 0;
- minimum_speed = -1;
- latency = 0;
- result = powernow_acpi_init();
- if (result) {
- printk(KERN_INFO PFX
- "ACPI and legacy methods failed\n");
- }
- } else {
- /* SGTC use the bus clock as timer */
- latency = fixup_sgtc();
- printk(KERN_INFO PFX "SGTC: %d\n", latency);
- }
- }
-
- if (result)
- return result;
-
- printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
- minimum_speed/1000, maximum_speed/1000);
-
- policy->cpuinfo.transition_latency =
- cpufreq_scale(2000000UL, fsb, latency);
-
- policy->cur = powernow_get(0);
-
- cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
-
- return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
-}
-
-static int powernow_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- if (acpi_processor_perf) {
- acpi_processor_unregister_performance(acpi_processor_perf, 0);
- free_cpumask_var(acpi_processor_perf->shared_cpu_map);
- kfree(acpi_processor_perf);
- }
-#endif
-
- kfree(powernow_table);
- return 0;
-}
-
-static struct freq_attr *powernow_table_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver powernow_driver = {
- .verify = powernow_verify,
- .target = powernow_target,
- .get = powernow_get,
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- .bios_limit = acpi_processor_get_bios_limit,
-#endif
- .init = powernow_cpu_init,
- .exit = powernow_cpu_exit,
- .name = "powernow-k7",
- .owner = THIS_MODULE,
- .attr = powernow_table_attr,
-};
-
-static int __init powernow_init(void)
-{
- if (check_powernow() == 0)
- return -ENODEV;
- return cpufreq_register_driver(&powernow_driver);
-}
-
-
-static void __exit powernow_exit(void)
-{
- cpufreq_unregister_driver(&powernow_driver);
-}
-
-module_param(acpi_force, int, 0444);
-MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernow_init);
-module_exit(powernow_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * (C) 2003 Dave Jones.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * AMD-specific information
- *
- */
-
-union msr_fidvidctl {
- struct {
- unsigned FID:5, // 4:0
- reserved1:3, // 7:5
- VID:5, // 12:8
- reserved2:3, // 15:13
- FIDC:1, // 16
- VIDC:1, // 17
- reserved3:2, // 19:18
- FIDCHGRATIO:1, // 20
- reserved4:11, // 31-21
- SGTC:20, // 32:51
- reserved5:12; // 63:52
- } bits;
- unsigned long long val;
-};
-
-union msr_fidvidstatus {
- struct {
- unsigned CFID:5, // 4:0
- reserved1:3, // 7:5
- SFID:5, // 12:8
- reserved2:3, // 15:13
- MFID:5, // 20:16
- reserved3:11, // 31:21
- CVID:5, // 36:32
- reserved4:3, // 39:37
- SVID:5, // 44:40
- reserved5:3, // 47:45
- MVID:5, // 52:48
- reserved6:11; // 63:53
- } bits;
- unsigned long long val;
-};
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 2368e38327b..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1607 +0,0 @@
-/*
- * (c) 2003-2010 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
- * Support : mark.langsdorf@amd.com
- *
- * Based on the powernow-k7.c module written by Dave Jones.
- * (C) 2003 Dave Jones on behalf of SuSE Labs
- * (C) 2004 Dominik Brodowski <linux@brodo.de>
- * (C) 2004 Pavel Machek <pavel@ucw.cz>
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- * Valuable input gratefully received from Dave Jones, Pavel Machek,
- * Dominik Brodowski, Jacob Shin, and others.
- * Originally developed by Paul Devriendt.
- * Processor information obtained from Chapter 9 (Power and Thermal Management)
- * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
- * Opteron Processors" available for download from www.amd.com
- *
- * Tables for specific CPUs can be inferred from
- * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/cpumask.h>
-#include <linux/sched.h> /* for current / set_cpus_allowed() */
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-
-#include <linux/acpi.h>
-#include <linux/mutex.h>
-#include <acpi/processor.h>
-
-#define PFX "powernow-k8: "
-#define VERSION "version 2.20.00"
-#include "powernow-k8.h"
-#include "mperf.h"
-
-/* serialize freq changes */
-static DEFINE_MUTEX(fidvid_mutex);
-
-static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
-
-static int cpu_family = CPU_OPTERON;
-
-/* core performance boost */
-static bool cpb_capable, cpb_enabled;
-static struct msr __percpu *msrs;
-
-static struct cpufreq_driver cpufreq_amd64_driver;
-
-#ifndef CONFIG_SMP
-static inline const struct cpumask *cpu_core_mask(int cpu)
-{
- return cpumask_of(0);
-}
-#endif
-
-/* Return a frequency in MHz, given an input fid */
-static u32 find_freq_from_fid(u32 fid)
-{
- return 800 + (fid * 100);
-}
-
-/* Return a frequency in KHz, given an input fid */
-static u32 find_khz_freq_from_fid(u32 fid)
-{
- return 1000 * find_freq_from_fid(fid);
-}
-
-static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
- u32 pstate)
-{
- return data[pstate].frequency;
-}
-
-/* Return the vco fid for an input fid
- *
- * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
- * only from corresponding high fids. This returns "high" fid corresponding to
- * "low" one.
- */
-static u32 convert_fid_to_vco_fid(u32 fid)
-{
- if (fid < HI_FID_TABLE_BOTTOM)
- return 8 + (2 * fid);
- else
- return fid;
-}
-
-/*
- * Return 1 if the pending bit is set. Unless we just instructed the processor
- * to transition to a new state, seeing this bit set is really bad news.
- */
-static int pending_bit_stuck(void)
-{
- u32 lo, hi;
-
- if (cpu_family == CPU_HW_PSTATE)
- return 0;
-
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
-}
-
-/*
- * Update the global current fid / vid values from the status msr.
- * Returns 1 on error.
- */
-static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
-{
- u32 lo, hi;
- u32 i = 0;
-
- if (cpu_family == CPU_HW_PSTATE) {
- rdmsr(MSR_PSTATE_STATUS, lo, hi);
- i = lo & HW_PSTATE_MASK;
- data->currpstate = i;
-
- /*
- * a workaround for family 11h erratum 311 might cause
- * an "out-of-range Pstate if the core is in Pstate-0
- */
- if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
- data->currpstate = HW_PSTATE_0;
-
- return 0;
- }
- do {
- if (i++ > 10000) {
- dprintk("detected change pending stuck\n");
- return 1;
- }
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- } while (lo & MSR_S_LO_CHANGE_PENDING);
-
- data->currvid = hi & MSR_S_HI_CURRENT_VID;
- data->currfid = lo & MSR_S_LO_CURRENT_FID;
-
- return 0;
-}
-
-/* the isochronous relief time */
-static void count_off_irt(struct powernow_k8_data *data)
-{
- udelay((1 << data->irt) * 10);
- return;
-}
-
-/* the voltage stabilization time */
-static void count_off_vst(struct powernow_k8_data *data)
-{
- udelay(data->vstable * VST_UNITS_20US);
- return;
-}
-
-/* need to init the control msr to a safe value (for each cpu) */
-static void fidvid_msr_init(void)
-{
- u32 lo, hi;
- u8 fid, vid;
-
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- vid = hi & MSR_S_HI_CURRENT_VID;
- fid = lo & MSR_S_LO_CURRENT_FID;
- lo = fid | (vid << MSR_C_LO_VID_SHIFT);
- hi = MSR_C_HI_STP_GNT_BENIGN;
- dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
- wrmsr(MSR_FIDVID_CTL, lo, hi);
-}
-
-/* write the new fid value along with the other control fields to the msr */
-static int write_new_fid(struct powernow_k8_data *data, u32 fid)
-{
- u32 lo;
- u32 savevid = data->currvid;
- u32 i = 0;
-
- if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
- printk(KERN_ERR PFX "internal error - overflow on fid write\n");
- return 1;
- }
-
- lo = fid;
- lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
- lo |= MSR_C_LO_INIT_FID_VID;
-
- dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
- fid, lo, data->plllock * PLL_LOCK_CONVERSION);
-
- do {
- wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
- if (i++ > 100) {
- printk(KERN_ERR PFX
- "Hardware error - pending bit very stuck - "
- "no further pstate changes possible\n");
- return 1;
- }
- } while (query_current_values_with_pending_wait(data));
-
- count_off_irt(data);
-
- if (savevid != data->currvid) {
- printk(KERN_ERR PFX
- "vid change on fid trans, old 0x%x, new 0x%x\n",
- savevid, data->currvid);
- return 1;
- }
-
- if (fid != data->currfid) {
- printk(KERN_ERR PFX
- "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
- data->currfid);
- return 1;
- }
-
- return 0;
-}
-
-/* Write a new vid to the hardware */
-static int write_new_vid(struct powernow_k8_data *data, u32 vid)
-{
- u32 lo;
- u32 savefid = data->currfid;
- int i = 0;
-
- if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
- printk(KERN_ERR PFX "internal error - overflow on vid write\n");
- return 1;
- }
-
- lo = data->currfid;
- lo |= (vid << MSR_C_LO_VID_SHIFT);
- lo |= MSR_C_LO_INIT_FID_VID;
-
- dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
- vid, lo, STOP_GRANT_5NS);
-
- do {
- wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
- if (i++ > 100) {
- printk(KERN_ERR PFX "internal error - pending bit "
- "very stuck - no further pstate "
- "changes possible\n");
- return 1;
- }
- } while (query_current_values_with_pending_wait(data));
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX "fid changed on vid trans, old "
- "0x%x new 0x%x\n",
- savefid, data->currfid);
- return 1;
- }
-
- if (vid != data->currvid) {
- printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
- "curr 0x%x\n",
- vid, data->currvid);
- return 1;
- }
-
- return 0;
-}
-
-/*
- * Reduce the vid by the max of step or reqvid.
- * Decreasing vid codes represent increasing voltages:
- * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
- */
-static int decrease_vid_code_by_step(struct powernow_k8_data *data,
- u32 reqvid, u32 step)
-{
- if ((data->currvid - reqvid) > step)
- reqvid = data->currvid - step;
-
- if (write_new_vid(data, reqvid))
- return 1;
-
- count_off_vst(data);
-
- return 0;
-}
-
-/* Change hardware pstate by single MSR write */
-static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
-{
- wrmsr(MSR_PSTATE_CTRL, pstate, 0);
- data->currpstate = pstate;
- return 0;
-}
-
-/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
-static int transition_fid_vid(struct powernow_k8_data *data,
- u32 reqfid, u32 reqvid)
-{
- if (core_voltage_pre_transition(data, reqvid, reqfid))
- return 1;
-
- if (core_frequency_transition(data, reqfid))
- return 1;
-
- if (core_voltage_post_transition(data, reqvid))
- return 1;
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
- printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
- "curr 0x%x 0x%x\n",
- smp_processor_id(),
- reqfid, reqvid, data->currfid, data->currvid);
- return 1;
- }
-
- dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
- smp_processor_id(), data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 1 - core voltage transition ... setup voltage */
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
- u32 reqvid, u32 reqfid)
-{
- u32 rvosteps = data->rvo;
- u32 savefid = data->currfid;
- u32 maxvid, lo, rvomult = 1;
-
- dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
- "reqvid 0x%x, rvo 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid, reqvid, data->rvo);
-
- if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
- rvomult = 2;
- rvosteps *= rvomult;
- rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
- maxvid = 0x1f & (maxvid >> 16);
- dprintk("ph1 maxvid=0x%x\n", maxvid);
- if (reqvid < maxvid) /* lower numbers are higher voltages */
- reqvid = maxvid;
-
- while (data->currvid > reqvid) {
- dprintk("ph1: curr 0x%x, req vid 0x%x\n",
- data->currvid, reqvid);
- if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
- return 1;
- }
-
- while ((rvosteps > 0) &&
- ((rvomult * data->rvo + data->currvid) > reqvid)) {
- if (data->currvid == maxvid) {
- rvosteps = 0;
- } else {
- dprintk("ph1: changing vid for rvo, req 0x%x\n",
- data->currvid - 1);
- if (decrease_vid_code_by_step(data, data->currvid-1, 1))
- return 1;
- rvosteps--;
- }
- }
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
- data->currfid);
- return 1;
- }
-
- dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 2 - core frequency transition */
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
-{
- u32 vcoreqfid, vcocurrfid, vcofiddiff;
- u32 fid_interval, savevid = data->currvid;
-
- if (data->currfid == reqfid) {
- printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
- data->currfid);
- return 0;
- }
-
- dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
- "reqfid 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid, reqfid);
-
- vcoreqfid = convert_fid_to_vco_fid(reqfid);
- vcocurrfid = convert_fid_to_vco_fid(data->currfid);
- vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
- : vcoreqfid - vcocurrfid;
-
- if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
- vcofiddiff = 0;
-
- while (vcofiddiff > 2) {
- (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
-
- if (reqfid > data->currfid) {
- if (data->currfid > LO_FID_TABLE_TOP) {
- if (write_new_fid(data,
- data->currfid + fid_interval))
- return 1;
- } else {
- if (write_new_fid
- (data,
- 2 + convert_fid_to_vco_fid(data->currfid)))
- return 1;
- }
- } else {
- if (write_new_fid(data, data->currfid - fid_interval))
- return 1;
- }
-
- vcocurrfid = convert_fid_to_vco_fid(data->currfid);
- vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
- : vcoreqfid - vcocurrfid;
- }
-
- if (write_new_fid(data, reqfid))
- return 1;
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (data->currfid != reqfid) {
- printk(KERN_ERR PFX
- "ph2: mismatch, failed fid transition, "
- "curr 0x%x, req 0x%x\n",
- data->currfid, reqfid);
- return 1;
- }
-
- if (savevid != data->currvid) {
- printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
- savevid, data->currvid);
- return 1;
- }
-
- dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 3 - core voltage transition flow ... jump to the final vid. */
-static int core_voltage_post_transition(struct powernow_k8_data *data,
- u32 reqvid)
-{
- u32 savefid = data->currfid;
- u32 savereqvid = reqvid;
-
- dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid);
-
- if (reqvid != data->currvid) {
- if (write_new_vid(data, reqvid))
- return 1;
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX
- "ph3: bad fid change, save 0x%x, curr 0x%x\n",
- savefid, data->currfid);
- return 1;
- }
-
- if (data->currvid != reqvid) {
- printk(KERN_ERR PFX
- "ph3: failed vid transition\n, "
- "req 0x%x, curr 0x%x",
- reqvid, data->currvid);
- return 1;
- }
- }
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (savereqvid != data->currvid) {
- dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
- return 1;
- }
-
- if (savefid != data->currfid) {
- dprintk("ph3 failed, currfid changed 0x%x\n",
- data->currfid);
- return 1;
- }
-
- dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-static void check_supported_cpu(void *_rc)
-{
- u32 eax, ebx, ecx, edx;
- int *rc = _rc;
-
- *rc = -ENODEV;
-
- if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD)
- return;
-
- eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
- ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
- return;
-
- if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
- if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
- ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
- printk(KERN_INFO PFX
- "Processor cpuid %x not supported\n", eax);
- return;
- }
-
- eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
- if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
- printk(KERN_INFO PFX
- "No frequency change capabilities detected\n");
- return;
- }
-
- cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
- if ((edx & P_STATE_TRANSITION_CAPABLE)
- != P_STATE_TRANSITION_CAPABLE) {
- printk(KERN_INFO PFX
- "Power state transitions not supported\n");
- return;
- }
- } else { /* must be a HW Pstate capable processor */
- cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
- if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
- cpu_family = CPU_HW_PSTATE;
- else
- return;
- }
-
- *rc = 0;
-}
-
-static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
- u8 maxvid)
-{
- unsigned int j;
- u8 lastfid = 0xff;
-
- for (j = 0; j < data->numps; j++) {
- if (pst[j].vid > LEAST_VID) {
- printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
- j, pst[j].vid);
- return -EINVAL;
- }
- if (pst[j].vid < data->rvo) {
- /* vid + rvo >= 0 */
- printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (pst[j].vid < maxvid + data->rvo) {
- /* vid + rvo >= maxvid */
- printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (pst[j].fid > MAX_FID) {
- printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
- /* Only first fid is allowed to be in "low" range */
- printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
- "0x%x\n", j, pst[j].fid);
- return -EINVAL;
- }
- if (pst[j].fid < lastfid)
- lastfid = pst[j].fid;
- }
- if (lastfid & 1) {
- printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
- return -EINVAL;
- }
- if (lastfid > LO_FID_TABLE_TOP)
- printk(KERN_INFO FW_BUG PFX
- "first fid not from lo freq table\n");
-
- return 0;
-}
-
-static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
- unsigned int entry)
-{
- powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-
-static void print_basics(struct powernow_k8_data *data)
-{
- int j;
- for (j = 0; j < data->numps; j++) {
- if (data->powernow_table[j].frequency !=
- CPUFREQ_ENTRY_INVALID) {
- if (cpu_family == CPU_HW_PSTATE) {
- printk(KERN_INFO PFX
- " %d : pstate %d (%d MHz)\n", j,
- data->powernow_table[j].index,
- data->powernow_table[j].frequency/1000);
- } else {
- printk(KERN_INFO PFX
- "fid 0x%x (%d MHz), vid 0x%x\n",
- data->powernow_table[j].index & 0xff,
- data->powernow_table[j].frequency/1000,
- data->powernow_table[j].index >> 8);
- }
- }
- }
- if (data->batps)
- printk(KERN_INFO PFX "Only %d pstates on battery\n",
- data->batps);
-}
-
-static u32 freq_from_fid_did(u32 fid, u32 did)
-{
- u32 mhz = 0;
-
- if (boot_cpu_data.x86 == 0x10)
- mhz = (100 * (fid + 0x10)) >> did;
- else if (boot_cpu_data.x86 == 0x11)
- mhz = (100 * (fid + 8)) >> did;
- else
- BUG();
-
- return mhz * 1000;
-}
-
-static int fill_powernow_table(struct powernow_k8_data *data,
- struct pst_s *pst, u8 maxvid)
-{
- struct cpufreq_frequency_table *powernow_table;
- unsigned int j;
-
- if (data->batps) {
- /* use ACPI support to get full speed on mains power */
- printk(KERN_WARNING PFX
- "Only %d pstates usable (use ACPI driver for full "
- "range\n", data->batps);
- data->numps = data->batps;
- }
-
- for (j = 1; j < data->numps; j++) {
- if (pst[j-1].fid >= pst[j].fid) {
- printk(KERN_ERR PFX "PST out of sequence\n");
- return -EINVAL;
- }
- }
-
- if (data->numps < 2) {
- printk(KERN_ERR PFX "no p states to transition\n");
- return -ENODEV;
- }
-
- if (check_pst_table(data, pst, maxvid))
- return -EINVAL;
-
- powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
- * (data->numps + 1)), GFP_KERNEL);
- if (!powernow_table) {
- printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
- return -ENOMEM;
- }
-
- for (j = 0; j < data->numps; j++) {
- int freq;
- powernow_table[j].index = pst[j].fid; /* lower 8 bits */
- powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
- freq = find_khz_freq_from_fid(pst[j].fid);
- powernow_table[j].frequency = freq;
- }
- powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
- powernow_table[data->numps].index = 0;
-
- if (query_current_values_with_pending_wait(data)) {
- kfree(powernow_table);
- return -EIO;
- }
-
- dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
- data->powernow_table = powernow_table;
- if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
- print_basics(data);
-
- for (j = 0; j < data->numps; j++)
- if ((pst[j].fid == data->currfid) &&
- (pst[j].vid == data->currvid))
- return 0;
-
- dprintk("currfid/vid do not match PST, ignoring\n");
- return 0;
-}
-
-/* Find and validate the PSB/PST table in BIOS. */
-static int find_psb_table(struct powernow_k8_data *data)
-{
- struct psb_s *psb;
- unsigned int i;
- u32 mvs;
- u8 maxvid;
- u32 cpst = 0;
- u32 thiscpuid;
-
- for (i = 0xc0000; i < 0xffff0; i += 0x10) {
- /* Scan BIOS looking for the signature. */
- /* It can not be at ffff0 - it is too big. */
-
- psb = phys_to_virt(i);
- if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
- continue;
-
- dprintk("found PSB header at 0x%p\n", psb);
-
- dprintk("table vers: 0x%x\n", psb->tableversion);
- if (psb->tableversion != PSB_VERSION_1_4) {
- printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
- return -ENODEV;
- }
-
- dprintk("flags: 0x%x\n", psb->flags1);
- if (psb->flags1) {
- printk(KERN_ERR FW_BUG PFX "unknown flags\n");
- return -ENODEV;
- }
-
- data->vstable = psb->vstable;
- dprintk("voltage stabilization time: %d(*20us)\n",
- data->vstable);
-
- dprintk("flags2: 0x%x\n", psb->flags2);
- data->rvo = psb->flags2 & 3;
- data->irt = ((psb->flags2) >> 2) & 3;
- mvs = ((psb->flags2) >> 4) & 3;
- data->vidmvs = 1 << mvs;
- data->batps = ((psb->flags2) >> 6) & 3;
-
- dprintk("ramp voltage offset: %d\n", data->rvo);
- dprintk("isochronous relief time: %d\n", data->irt);
- dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
-
- dprintk("numpst: 0x%x\n", psb->num_tables);
- cpst = psb->num_tables;
- if ((psb->cpuid == 0x00000fc0) ||
- (psb->cpuid == 0x00000fe0)) {
- thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- if ((thiscpuid == 0x00000fc0) ||
- (thiscpuid == 0x00000fe0))
- cpst = 1;
- }
- if (cpst != 1) {
- printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
- return -ENODEV;
- }
-
- data->plllock = psb->plllocktime;
- dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
- dprintk("maxfid: 0x%x\n", psb->maxfid);
- dprintk("maxvid: 0x%x\n", psb->maxvid);
- maxvid = psb->maxvid;
-
- data->numps = psb->numps;
- dprintk("numpstates: 0x%x\n", data->numps);
- return fill_powernow_table(data,
- (struct pst_s *)(psb+1), maxvid);
- }
- /*
- * If you see this message, complain to BIOS manufacturer. If
- * he tells you "we do not support Linux" or some similar
- * nonsense, remember that Windows 2000 uses the same legacy
- * mechanism that the old Linux PSB driver uses. Tell them it
- * is broken with Windows 2000.
- *
- * The reference to the AMD documentation is chapter 9 in the
- * BIOS and Kernel Developer's Guide, which is available on
- * www.amd.com
- */
- printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
- printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
- " and Cool'N'Quiet support is enabled in BIOS setup\n");
- return -ENODEV;
-}
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
- unsigned int index)
-{
- u64 control;
-
- if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
- return;
-
- control = data->acpi_data.states[index].control;
- data->irt = (control >> IRT_SHIFT) & IRT_MASK;
- data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
- data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
- data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
- data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
- data->vstable = (control >> VST_SHIFT) & VST_MASK;
-}
-
-static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
-{
- struct cpufreq_frequency_table *powernow_table;
- int ret_val = -ENODEV;
- u64 control, status;
-
- if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
- dprintk("register performance failed: bad ACPI data\n");
- return -EIO;
- }
-
- /* verify the data contained in the ACPI structures */
- if (data->acpi_data.state_count <= 1) {
- dprintk("No ACPI P-States\n");
- goto err_out;
- }
-
- control = data->acpi_data.control_register.space_id;
- status = data->acpi_data.status_register.space_id;
-
- if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
- (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
- dprintk("Invalid control/status registers (%x - %x)\n",
- control, status);
- goto err_out;
- }
-
- /* fill in data->powernow_table */
- powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
- * (data->acpi_data.state_count + 1)), GFP_KERNEL);
- if (!powernow_table) {
- dprintk("powernow_table memory alloc failure\n");
- goto err_out;
- }
-
- /* fill in data */
- data->numps = data->acpi_data.state_count;
- powernow_k8_acpi_pst_values(data, 0);
-
- if (cpu_family == CPU_HW_PSTATE)
- ret_val = fill_powernow_table_pstate(data, powernow_table);
- else
- ret_val = fill_powernow_table_fidvid(data, powernow_table);
- if (ret_val)
- goto err_out_mem;
-
- powernow_table[data->acpi_data.state_count].frequency =
- CPUFREQ_TABLE_END;
- powernow_table[data->acpi_data.state_count].index = 0;
- data->powernow_table = powernow_table;
-
- if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
- print_basics(data);
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
- printk(KERN_ERR PFX
- "unable to alloc powernow_k8_data cpumask\n");
- ret_val = -ENOMEM;
- goto err_out_mem;
- }
-
- return 0;
-
-err_out_mem:
- kfree(powernow_table);
-
-err_out:
- acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
-
- /* data->acpi_data.state_count informs us at ->exit()
- * whether ACPI was used */
- data->acpi_data.state_count = 0;
-
- return ret_val;
-}
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data,
- struct cpufreq_frequency_table *powernow_table)
-{
- int i;
- u32 hi = 0, lo = 0;
- rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
- data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
-
- for (i = 0; i < data->acpi_data.state_count; i++) {
- u32 index;
-
- index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
- if (index > data->max_hw_pstate) {
- printk(KERN_ERR PFX "invalid pstate %d - "
- "bad value %d.\n", i, index);
- printk(KERN_ERR PFX "Please report to BIOS "
- "manufacturer\n");
- invalidate_entry(powernow_table, i);
- continue;
- }
- rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
- if (!(hi & HW_PSTATE_VALID_MASK)) {
- dprintk("invalid pstate %d, ignoring\n", index);
- invalidate_entry(powernow_table, i);
- continue;
- }
-
- powernow_table[i].index = index;
-
- /* Frequency may be rounded for these */
- if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
- || boot_cpu_data.x86 == 0x11) {
- powernow_table[i].frequency =
- freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
- } else
- powernow_table[i].frequency =
- data->acpi_data.states[i].core_frequency * 1000;
- }
- return 0;
-}
-
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
- struct cpufreq_frequency_table *powernow_table)
-{
- int i;
-
- for (i = 0; i < data->acpi_data.state_count; i++) {
- u32 fid;
- u32 vid;
- u32 freq, index;
- u64 status, control;
-
- if (data->exttype) {
- status = data->acpi_data.states[i].status;
- fid = status & EXT_FID_MASK;
- vid = (status >> VID_SHIFT) & EXT_VID_MASK;
- } else {
- control = data->acpi_data.states[i].control;
- fid = control & FID_MASK;
- vid = (control >> VID_SHIFT) & VID_MASK;
- }
-
- dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
-
- index = fid | (vid<<8);
- powernow_table[i].index = index;
-
- freq = find_khz_freq_from_fid(fid);
- powernow_table[i].frequency = freq;
-
- /* verify frequency is OK */
- if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
- dprintk("invalid freq %u kHz, ignoring\n", freq);
- invalidate_entry(powernow_table, i);
- continue;
- }
-
- /* verify voltage is OK -
- * BIOSs are using "off" to indicate invalid */
- if (vid == VID_OFF) {
- dprintk("invalid vid %u, ignoring\n", vid);
- invalidate_entry(powernow_table, i);
- continue;
- }
-
- if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
- printk(KERN_INFO PFX "invalid freq entries "
- "%u kHz vs. %u kHz\n", freq,
- (unsigned int)
- (data->acpi_data.states[i].core_frequency
- * 1000));
- invalidate_entry(powernow_table, i);
- continue;
- }
- }
- return 0;
-}
-
-static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
-{
- if (data->acpi_data.state_count)
- acpi_processor_unregister_performance(&data->acpi_data,
- data->cpu);
- free_cpumask_var(data->acpi_data.shared_cpu_map);
-}
-
-static int get_transition_latency(struct powernow_k8_data *data)
-{
- int max_latency = 0;
- int i;
- for (i = 0; i < data->acpi_data.state_count; i++) {
- int cur_latency = data->acpi_data.states[i].transition_latency
- + data->acpi_data.states[i].bus_master_latency;
- if (cur_latency > max_latency)
- max_latency = cur_latency;
- }
- if (max_latency == 0) {
- /*
- * Fam 11h and later may return 0 as transition latency. This
- * is intended and means "very fast". While cpufreq core and
- * governors currently can handle that gracefully, better set it
- * to 1 to avoid problems in the future.
- */
- if (boot_cpu_data.x86 < 0x11)
- printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
- "latency\n");
- max_latency = 1;
- }
- /* value in usecs, needs to be in nanoseconds */
- return 1000 * max_latency;
-}
-
-/* Take a frequency, and issue the fid/vid transition command */
-static int transition_frequency_fidvid(struct powernow_k8_data *data,
- unsigned int index)
-{
- u32 fid = 0;
- u32 vid = 0;
- int res, i;
- struct cpufreq_freqs freqs;
-
- dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
- /* fid/vid correctness check for k8 */
- /* fid are the lower 8 bits of the index we stored into
- * the cpufreq frequency table in find_psb_table, vid
- * are the upper 8 bits.
- */
- fid = data->powernow_table[index].index & 0xFF;
- vid = (data->powernow_table[index].index & 0xFF00) >> 8;
-
- dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if ((data->currvid == vid) && (data->currfid == fid)) {
- dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
- fid, vid);
- return 0;
- }
-
- dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
- smp_processor_id(), fid, vid);
- freqs.old = find_khz_freq_from_fid(data->currfid);
- freqs.new = find_khz_freq_from_fid(fid);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- res = transition_fid_vid(data, fid, vid);
- freqs.new = find_khz_freq_from_fid(data->currfid);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- return res;
-}
-
-/* Take a frequency, and issue the hardware pstate transition command */
-static int transition_frequency_pstate(struct powernow_k8_data *data,
- unsigned int index)
-{
- u32 pstate = 0;
- int res, i;
- struct cpufreq_freqs freqs;
-
- dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
- /* get MSR index for hardware pstate transition */
- pstate = index & HW_PSTATE_MASK;
- if (pstate > data->max_hw_pstate)
- return 0;
- freqs.old = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- res = transition_pstate(data, pstate);
- freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- return res;
-}
-
-/* Driver entry point to switch to the target frequency */
-static int powernowk8_target(struct cpufreq_policy *pol,
- unsigned targfreq, unsigned relation)
-{
- cpumask_var_t oldmask;
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
- u32 checkfid;
- u32 checkvid;
- unsigned int newstate;
- int ret = -EIO;
-
- if (!data)
- return -EINVAL;
-
- checkfid = data->currfid;
- checkvid = data->currvid;
-
- /* only run on specific CPU from here on. */
- /* This is poor form: use a workqueue or smp_call_function_single */
- if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_copy(oldmask, tsk_cpus_allowed(current));
- set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
-
- if (smp_processor_id() != pol->cpu) {
- printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
- goto err_out;
- }
-
- if (pending_bit_stuck()) {
- printk(KERN_ERR PFX "failing targ, change pending bit set\n");
- goto err_out;
- }
-
- dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
- pol->cpu, targfreq, pol->min, pol->max, relation);
-
- if (query_current_values_with_pending_wait(data))
- goto err_out;
-
- if (cpu_family != CPU_HW_PSTATE) {
- dprintk("targ: curr fid 0x%x, vid 0x%x\n",
- data->currfid, data->currvid);
-
- if ((checkvid != data->currvid) ||
- (checkfid != data->currfid)) {
- printk(KERN_INFO PFX
- "error - out of sync, fix 0x%x 0x%x, "
- "vid 0x%x 0x%x\n",
- checkfid, data->currfid,
- checkvid, data->currvid);
- }
- }
-
- if (cpufreq_frequency_table_target(pol, data->powernow_table,
- targfreq, relation, &newstate))
- goto err_out;
-
- mutex_lock(&fidvid_mutex);
-
- powernow_k8_acpi_pst_values(data, newstate);
-
- if (cpu_family == CPU_HW_PSTATE)
- ret = transition_frequency_pstate(data, newstate);
- else
- ret = transition_frequency_fidvid(data, newstate);
- if (ret) {
- printk(KERN_ERR PFX "transition frequency failed\n");
- ret = 1;
- mutex_unlock(&fidvid_mutex);
- goto err_out;
- }
- mutex_unlock(&fidvid_mutex);
-
- if (cpu_family == CPU_HW_PSTATE)
- pol->cur = find_khz_freq_from_pstate(data->powernow_table,
- newstate);
- else
- pol->cur = find_khz_freq_from_fid(data->currfid);
- ret = 0;
-
-err_out:
- set_cpus_allowed_ptr(current, oldmask);
- free_cpumask_var(oldmask);
- return ret;
-}
-
-/* Driver entry point to verify the policy and range of frequencies */
-static int powernowk8_verify(struct cpufreq_policy *pol)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
- if (!data)
- return -EINVAL;
-
- return cpufreq_frequency_table_verify(pol, data->powernow_table);
-}
-
-struct init_on_cpu {
- struct powernow_k8_data *data;
- int rc;
-};
-
-static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
-{
- struct init_on_cpu *init_on_cpu = _init_on_cpu;
-
- if (pending_bit_stuck()) {
- printk(KERN_ERR PFX "failing init, change pending bit set\n");
- init_on_cpu->rc = -ENODEV;
- return;
- }
-
- if (query_current_values_with_pending_wait(init_on_cpu->data)) {
- init_on_cpu->rc = -ENODEV;
- return;
- }
-
- if (cpu_family == CPU_OPTERON)
- fidvid_msr_init();
-
- init_on_cpu->rc = 0;
-}
-
-/* per CPU init entry point to the driver */
-static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
-{
- static const char ACPI_PSS_BIOS_BUG_MSG[] =
- KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
- FW_BUG PFX "Try again with latest BIOS.\n";
- struct powernow_k8_data *data;
- struct init_on_cpu init_on_cpu;
- int rc;
- struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
-
- if (!cpu_online(pol->cpu))
- return -ENODEV;
-
- smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
- if (rc)
- return -ENODEV;
-
- data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
- if (!data) {
- printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
- return -ENOMEM;
- }
-
- data->cpu = pol->cpu;
- data->currpstate = HW_PSTATE_INVALID;
-
- if (powernow_k8_cpu_init_acpi(data)) {
- /*
- * Use the PSB BIOS structure. This is only available on
- * an UP version, and is deprecated by AMD.
- */
- if (num_online_cpus() != 1) {
- printk_once(ACPI_PSS_BIOS_BUG_MSG);
- goto err_out;
- }
- if (pol->cpu != 0) {
- printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
- "CPU other than CPU0. Complain to your BIOS "
- "vendor.\n");
- goto err_out;
- }
- rc = find_psb_table(data);
- if (rc)
- goto err_out;
-
- /* Take a crude guess here.
- * That guess was in microseconds, so multiply with 1000 */
- pol->cpuinfo.transition_latency = (
- ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
- ((1 << data->irt) * 30)) * 1000;
- } else /* ACPI _PSS objects available */
- pol->cpuinfo.transition_latency = get_transition_latency(data);
-
- /* only run on specific CPU from here on */
- init_on_cpu.data = data;
- smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
- &init_on_cpu, 1);
- rc = init_on_cpu.rc;
- if (rc != 0)
- goto err_out_exit_acpi;
-
- if (cpu_family == CPU_HW_PSTATE)
- cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
- else
- cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
- data->available_cores = pol->cpus;
-
- if (cpu_family == CPU_HW_PSTATE)
- pol->cur = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- else
- pol->cur = find_khz_freq_from_fid(data->currfid);
- dprintk("policy current frequency %d kHz\n", pol->cur);
-
- /* min/max the cpu is capable of */
- if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
- printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
- powernow_k8_cpu_exit_acpi(data);
- kfree(data->powernow_table);
- kfree(data);
- return -EINVAL;
- }
-
- /* Check for APERF/MPERF support in hardware */
- if (cpu_has(c, X86_FEATURE_APERFMPERF))
- cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
-
- cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
-
- if (cpu_family == CPU_HW_PSTATE)
- dprintk("cpu_init done, current pstate 0x%x\n",
- data->currpstate);
- else
- dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
- data->currfid, data->currvid);
-
- per_cpu(powernow_data, pol->cpu) = data;
-
- return 0;
-
-err_out_exit_acpi:
- powernow_k8_cpu_exit_acpi(data);
-
-err_out:
- kfree(data);
- return -ENODEV;
-}
-
-static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
- if (!data)
- return -EINVAL;
-
- powernow_k8_cpu_exit_acpi(data);
-
- cpufreq_frequency_table_put_attr(pol->cpu);
-
- kfree(data->powernow_table);
- kfree(data);
- per_cpu(powernow_data, pol->cpu) = NULL;
-
- return 0;
-}
-
-static void query_values_on_cpu(void *_err)
-{
- int *err = _err;
- struct powernow_k8_data *data = __this_cpu_read(powernow_data);
-
- *err = query_current_values_with_pending_wait(data);
-}
-
-static unsigned int powernowk8_get(unsigned int cpu)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
- unsigned int khz = 0;
- int err;
-
- if (!data)
- return 0;
-
- smp_call_function_single(cpu, query_values_on_cpu, &err, true);
- if (err)
- goto out;
-
- if (cpu_family == CPU_HW_PSTATE)
- khz = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- else
- khz = find_khz_freq_from_fid(data->currfid);
-
-
-out:
- return khz;
-}
-
-static void _cpb_toggle_msrs(bool t)
-{
- int cpu;
-
- get_online_cpus();
-
- rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
- for_each_cpu(cpu, cpu_online_mask) {
- struct msr *reg = per_cpu_ptr(msrs, cpu);
- if (t)
- reg->l &= ~BIT(25);
- else
- reg->l |= BIT(25);
- }
- wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
- put_online_cpus();
-}
-
-/*
- * Switch on/off core performance boosting.
- *
- * 0=disable
- * 1=enable.
- */
-static void cpb_toggle(bool t)
-{
- if (!cpb_capable)
- return;
-
- if (t && !cpb_enabled) {
- cpb_enabled = true;
- _cpb_toggle_msrs(t);
- printk(KERN_INFO PFX "Core Boosting enabled.\n");
- } else if (!t && cpb_enabled) {
- cpb_enabled = false;
- _cpb_toggle_msrs(t);
- printk(KERN_INFO PFX "Core Boosting disabled.\n");
- }
-}
-
-static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
- size_t count)
-{
- int ret = -EINVAL;
- unsigned long val = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (!ret && (val == 0 || val == 1) && cpb_capable)
- cpb_toggle(val);
- else
- return -EINVAL;
-
- return count;
-}
-
-static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
-{
- return sprintf(buf, "%u\n", cpb_enabled);
-}
-
-#define define_one_rw(_name) \
-static struct freq_attr _name = \
-__ATTR(_name, 0644, show_##_name, store_##_name)
-
-define_one_rw(cpb);
-
-static struct freq_attr *powernow_k8_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- &cpb,
- NULL,
-};
-
-static struct cpufreq_driver cpufreq_amd64_driver = {
- .verify = powernowk8_verify,
- .target = powernowk8_target,
- .bios_limit = acpi_processor_get_bios_limit,
- .init = powernowk8_cpu_init,
- .exit = __devexit_p(powernowk8_cpu_exit),
- .get = powernowk8_get,
- .name = "powernow-k8",
- .owner = THIS_MODULE,
- .attr = powernow_k8_attr,
-};
-
-/*
- * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
- * cannot block the remaining ones from boosting. On the CPU_UP path we
- * simply keep the boost-disable flag in sync with the current global
- * state.
- */
-static int cpb_notify(struct notifier_block *nb, unsigned long action,
- void *hcpu)
-{
- unsigned cpu = (long)hcpu;
- u32 lo, hi;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
-
- if (!cpb_enabled) {
- rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
- lo |= BIT(25);
- wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
- }
- break;
-
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
- lo &= ~BIT(25);
- wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
- break;
-
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpb_nb = {
- .notifier_call = cpb_notify,
-};
-
-/* driver entry point for init */
-static int __cpuinit powernowk8_init(void)
-{
- unsigned int i, supported_cpus = 0, cpu;
- int rv;
-
- for_each_online_cpu(i) {
- int rc;
- smp_call_function_single(i, check_supported_cpu, &rc, 1);
- if (rc == 0)
- supported_cpus++;
- }
-
- if (supported_cpus != num_online_cpus())
- return -ENODEV;
-
- printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
- num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
-
- if (boot_cpu_has(X86_FEATURE_CPB)) {
-
- cpb_capable = true;
-
- msrs = msrs_alloc();
- if (!msrs) {
- printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
- return -ENOMEM;
- }
-
- register_cpu_notifier(&cpb_nb);
-
- rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
- for_each_cpu(cpu, cpu_online_mask) {
- struct msr *reg = per_cpu_ptr(msrs, cpu);
- cpb_enabled |= !(!!(reg->l & BIT(25)));
- }
-
- printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
- (cpb_enabled ? "on" : "off"));
- }
-
- rv = cpufreq_register_driver(&cpufreq_amd64_driver);
- if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) {
- unregister_cpu_notifier(&cpb_nb);
- msrs_free(msrs);
- msrs = NULL;
- }
- return rv;
-}
-
-/* driver entry point for term */
-static void __exit powernowk8_exit(void)
-{
- dprintk("exit\n");
-
- if (boot_cpu_has(X86_FEATURE_CPB)) {
- msrs_free(msrs);
- msrs = NULL;
-
- unregister_cpu_notifier(&cpb_nb);
- }
-
- cpufreq_unregister_driver(&cpufreq_amd64_driver);
-}
-
-MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
- "Mark Langsdorf <mark.langsdorf@amd.com>");
-MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernowk8_init);
-module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index df3529b1c02..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * (c) 2003-2006 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- */
-
-enum pstate {
- HW_PSTATE_INVALID = 0xff,
- HW_PSTATE_0 = 0,
- HW_PSTATE_1 = 1,
- HW_PSTATE_2 = 2,
- HW_PSTATE_3 = 3,
- HW_PSTATE_4 = 4,
- HW_PSTATE_5 = 5,
- HW_PSTATE_6 = 6,
- HW_PSTATE_7 = 7,
-};
-
-struct powernow_k8_data {
- unsigned int cpu;
-
- u32 numps; /* number of p-states */
- u32 batps; /* number of p-states supported on battery */
- u32 max_hw_pstate; /* maximum legal hardware pstate */
-
- /* these values are constant when the PSB is used to determine
- * vid/fid pairings, but are modified during the ->target() call
- * when ACPI is used */
- u32 rvo; /* ramp voltage offset */
- u32 irt; /* isochronous relief time */
- u32 vidmvs; /* usable value calculated from mvs */
- u32 vstable; /* voltage stabilization time, units 20 us */
- u32 plllock; /* pll lock time, units 1 us */
- u32 exttype; /* extended interface = 1 */
-
- /* keep track of the current fid / vid or pstate */
- u32 currvid;
- u32 currfid;
- enum pstate currpstate;
-
- /* the powernow_table includes all frequency and vid/fid pairings:
- * fid are the lower 8 bits of the index, vid are the upper 8 bits.
- * frequency is in kHz */
- struct cpufreq_frequency_table *powernow_table;
-
- /* the acpi table needs to be kept. it's only available if ACPI was
- * used to determine valid frequency/vid/fid states */
- struct acpi_processor_performance acpi_data;
-
- /* we need to keep track of associated cores, but let cpufreq
- * handle hotplug events - so just point at cpufreq pol->cpus
- * structure */
- struct cpumask *available_cores;
-};
-
-/* processor's cpuid instruction support */
-#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
-#define CPUID_XFAM 0x0ff00000 /* extended family */
-#define CPUID_XFAM_K8 0
-#define CPUID_XMOD 0x000f0000 /* extended model */
-#define CPUID_XMOD_REV_MASK 0x000c0000
-#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
-#define CPUID_USE_XFAM_XMOD 0x00000f00
-#define CPUID_GET_MAX_CAPABILITIES 0x80000000
-#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
-#define P_STATE_TRANSITION_CAPABLE 6
-
-/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
-/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
-/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
-/* the register number is placed in ecx, and the data is returned in edx:eax. */
-
-#define MSR_FIDVID_CTL 0xc0010041
-#define MSR_FIDVID_STATUS 0xc0010042
-
-/* Field definitions within the FID VID Low Control MSR : */
-#define MSR_C_LO_INIT_FID_VID 0x00010000
-#define MSR_C_LO_NEW_VID 0x00003f00
-#define MSR_C_LO_NEW_FID 0x0000003f
-#define MSR_C_LO_VID_SHIFT 8
-
-/* Field definitions within the FID VID High Control MSR : */
-#define MSR_C_HI_STP_GNT_TO 0x000fffff
-
-/* Field definitions within the FID VID Low Status MSR : */
-#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
-#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
-#define MSR_S_LO_MAX_FID 0x003f0000
-#define MSR_S_LO_START_FID 0x00003f00
-#define MSR_S_LO_CURRENT_FID 0x0000003f
-
-/* Field definitions within the FID VID High Status MSR : */
-#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
-#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
-#define MSR_S_HI_START_VID 0x00003f00
-#define MSR_S_HI_CURRENT_VID 0x0000003f
-#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
-
-
-/* Hardware Pstate _PSS and MSR definitions */
-#define USE_HW_PSTATE 0x00000080
-#define HW_PSTATE_MASK 0x00000007
-#define HW_PSTATE_VALID_MASK 0x80000000
-#define HW_PSTATE_MAX_MASK 0x000000f0
-#define HW_PSTATE_MAX_SHIFT 4
-#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
-#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
-#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
-#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
-
-/* define the two driver architectures */
-#define CPU_OPTERON 0
-#define CPU_HW_PSTATE 1
-
-
-/*
- * There are restrictions frequencies have to follow:
- * - only 1 entry in the low fid table ( <=1.4GHz )
- * - lowest entry in the high fid table must be >= 2 * the entry in the
- * low fid table
- * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry
- * in the low fid table
- * - the parts can only step at <= 200 MHz intervals, odd fid values are
- * supported in revision G and later revisions.
- * - lowest frequency must be >= interprocessor hypertransport link speed
- * (only applies to MP systems obviously)
- */
-
-/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
-#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
-#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
-
-#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
-#define HI_VCOFREQ_TABLE_BOTTOM 1600
-
-#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
-
-#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
-#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
-
-#define MIN_FREQ 800 /* Min and max freqs, per spec */
-#define MAX_FREQ 5000
-
-#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
-#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
-
-#define VID_OFF 0x3f
-
-#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
-
-#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
-
-#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
-#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
-
-/*
- * Most values of interest are encoded in a single field of the _PSS
- * entries: the "control" value.
- */
-
-#define IRT_SHIFT 30
-#define RVO_SHIFT 28
-#define EXT_TYPE_SHIFT 27
-#define PLL_L_SHIFT 20
-#define MVS_SHIFT 18
-#define VST_SHIFT 11
-#define VID_SHIFT 6
-#define IRT_MASK 3
-#define RVO_MASK 3
-#define EXT_TYPE_MASK 1
-#define PLL_L_MASK 0x7f
-#define MVS_MASK 3
-#define VST_MASK 0x7f
-#define VID_MASK 0x1f
-#define FID_MASK 0x1f
-#define EXT_VID_MASK 0x3f
-#define EXT_FID_MASK 0x3f
-
-
-/*
- * Version 1.4 of the PSB table. This table is constructed by BIOS and is
- * to tell the OS's power management driver which VIDs and FIDs are
- * supported by this particular processor.
- * If the data in the PSB / PST is wrong, then this driver will program the
- * wrong values into hardware, which is very likely to lead to a crash.
- */
-
-#define PSB_ID_STRING "AMDK7PNOW!"
-#define PSB_ID_STRING_LEN 10
-
-#define PSB_VERSION_1_4 0x14
-
-struct psb_s {
- u8 signature[10];
- u8 tableversion;
- u8 flags1;
- u16 vstable;
- u8 flags2;
- u8 num_tables;
- u32 cpuid;
- u8 plllocktime;
- u8 maxfid;
- u8 maxvid;
- u8 numps;
-};
-
-/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
-struct pst_s {
- u8 fid;
- u8 vid;
-};
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
-
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
- u32 reqvid, u32 regfid);
-static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * sc520_freq.c: cpufreq driver for the AMD Elan sc520
- *
- * Copyright (C) 2005 Sean Young <sean@mess.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Based on elanfreq.c
- *
- * 2005-03-30: - initial revision
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define MMCR_BASE 0xfffef000 /* The default base address */
-#define OFFS_CPUCTL 0x2 /* CPU Control Register */
-
-static __u8 __iomem *cpuctl;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "sc520_freq", msg)
-#define PFX "sc520_freq: "
-
-static struct cpufreq_frequency_table sc520_freq_table[] = {
- {0x01, 100000},
- {0x02, 133000},
- {0, CPUFREQ_TABLE_END},
-};
-
-static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
-{
- u8 clockspeed_reg = *cpuctl;
-
- switch (clockspeed_reg & 0x03) {
- default:
- printk(KERN_ERR PFX "error: cpuctl register has unexpected "
- "value %02x\n", clockspeed_reg);
- case 0x01:
- return 100000;
- case 0x02:
- return 133000;
- }
-}
-
-static void sc520_freq_set_cpu_state(unsigned int state)
-{
-
- struct cpufreq_freqs freqs;
- u8 clockspeed_reg;
-
- freqs.old = sc520_freq_get_cpu_frequency(0);
- freqs.new = sc520_freq_table[state].frequency;
- freqs.cpu = 0; /* AMD Elan is UP */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- dprintk("attempting to set frequency to %i kHz\n",
- sc520_freq_table[state].frequency);
-
- local_irq_disable();
-
- clockspeed_reg = *cpuctl & ~0x03;
- *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
-
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-};
-
-static int sc520_freq_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
-}
-
-static int sc520_freq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, sc520_freq_table,
- target_freq, relation, &newstate))
- return -EINVAL;
-
- sc520_freq_set_cpu_state(newstate);
-
- return 0;
-}
-
-
-/*
- * Module init and exit code
- */
-
-static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int result;
-
- /* capability check */
- if (c->x86_vendor != X86_VENDOR_AMD ||
- c->x86 != 4 || c->x86_model != 9)
- return -ENODEV;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = 1000000; /* 1ms */
- policy->cur = sc520_freq_get_cpu_frequency(0);
-
- result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
-
- return 0;
-}
-
-
-static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-
-static struct freq_attr *sc520_freq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver sc520_freq_driver = {
- .get = sc520_freq_get_cpu_frequency,
- .verify = sc520_freq_verify,
- .target = sc520_freq_target,
- .init = sc520_freq_cpu_init,
- .exit = sc520_freq_cpu_exit,
- .name = "sc520_freq",
- .owner = THIS_MODULE,
- .attr = sc520_freq_attr,
-};
-
-
-static int __init sc520_freq_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int err;
-
- /* Test if we have the right hardware */
- if (c->x86_vendor != X86_VENDOR_AMD ||
- c->x86 != 4 || c->x86_model != 9) {
- dprintk("no Elan SC520 processor found!\n");
- return -ENODEV;
- }
- cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
- if (!cpuctl) {
- printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
- return -ENOMEM;
- }
-
- err = cpufreq_register_driver(&sc520_freq_driver);
- if (err)
- iounmap(cpuctl);
-
- return err;
-}
-
-
-static void __exit sc520_freq_exit(void)
-{
- cpufreq_unregister_driver(&sc520_freq_driver);
- iounmap(cpuctl);
-}
-
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Sean Young <sean@mess.org>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
-
-module_init(sc520_freq_init);
-module_exit(sc520_freq_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 9b1ff37de46..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,636 +0,0 @@
-/*
- * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
- * M (part of the Centrino chipset).
- *
- * Since the original Pentium M, most new Intel CPUs support Enhanced
- * SpeedStep.
- *
- * Despite the "SpeedStep" in the name, this is almost entirely unlike
- * traditional SpeedStep.
- *
- * Modelled on speedstep.c
- *
- * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/sched.h> /* current */
-#include <linux/delay.h>
-#include <linux/compiler.h>
-#include <linux/gfp.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-
-#define PFX "speedstep-centrino: "
-#define MAINTAINER "cpufreq@vger.kernel.org"
-
-#define dprintk(msg...) \
- cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
-
-#define INTEL_MSR_RANGE (0xffff)
-
-struct cpu_id
-{
- __u8 x86; /* CPU family */
- __u8 x86_model; /* model */
- __u8 x86_mask; /* stepping */
-};
-
-enum {
- CPU_BANIAS,
- CPU_DOTHAN_A1,
- CPU_DOTHAN_A2,
- CPU_DOTHAN_B0,
- CPU_MP4HT_D0,
- CPU_MP4HT_E0,
-};
-
-static const struct cpu_id cpu_ids[] = {
- [CPU_BANIAS] = { 6, 9, 5 },
- [CPU_DOTHAN_A1] = { 6, 13, 1 },
- [CPU_DOTHAN_A2] = { 6, 13, 2 },
- [CPU_DOTHAN_B0] = { 6, 13, 6 },
- [CPU_MP4HT_D0] = {15, 3, 4 },
- [CPU_MP4HT_E0] = {15, 4, 1 },
-};
-#define N_IDS ARRAY_SIZE(cpu_ids)
-
-struct cpu_model
-{
- const struct cpu_id *cpu_id;
- const char *model_name;
- unsigned max_freq; /* max clock in kHz */
-
- struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
-};
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
- const struct cpu_id *x);
-
-/* Operating points for current CPU */
-static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
-static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
-
-static struct cpufreq_driver centrino_driver;
-
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
-
-/* Computes the correct form for IA32_PERF_CTL MSR for a particular
- frequency/voltage operating point; frequency in MHz, volts in mV.
- This is stored as "index" in the structure. */
-#define OP(mhz, mv) \
- { \
- .frequency = (mhz) * 1000, \
- .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
- }
-
-/*
- * These voltage tables were derived from the Intel Pentium M
- * datasheet, document 25261202.pdf, Table 5. I have verified they
- * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
- * M.
- */
-
-/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
-static struct cpufreq_frequency_table banias_900[] =
-{
- OP(600, 844),
- OP(800, 988),
- OP(900, 1004),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
-static struct cpufreq_frequency_table banias_1000[] =
-{
- OP(600, 844),
- OP(800, 972),
- OP(900, 988),
- OP(1000, 1004),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
-static struct cpufreq_frequency_table banias_1100[] =
-{
- OP( 600, 956),
- OP( 800, 1020),
- OP( 900, 1100),
- OP(1000, 1164),
- OP(1100, 1180),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-
-/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
-static struct cpufreq_frequency_table banias_1200[] =
-{
- OP( 600, 956),
- OP( 800, 1004),
- OP( 900, 1020),
- OP(1000, 1100),
- OP(1100, 1164),
- OP(1200, 1180),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.30GHz (Banias) */
-static struct cpufreq_frequency_table banias_1300[] =
-{
- OP( 600, 956),
- OP( 800, 1260),
- OP(1000, 1292),
- OP(1200, 1356),
- OP(1300, 1388),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.40GHz (Banias) */
-static struct cpufreq_frequency_table banias_1400[] =
-{
- OP( 600, 956),
- OP( 800, 1180),
- OP(1000, 1308),
- OP(1200, 1436),
- OP(1400, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.50GHz (Banias) */
-static struct cpufreq_frequency_table banias_1500[] =
-{
- OP( 600, 956),
- OP( 800, 1116),
- OP(1000, 1228),
- OP(1200, 1356),
- OP(1400, 1452),
- OP(1500, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.60GHz (Banias) */
-static struct cpufreq_frequency_table banias_1600[] =
-{
- OP( 600, 956),
- OP( 800, 1036),
- OP(1000, 1164),
- OP(1200, 1276),
- OP(1400, 1420),
- OP(1600, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.70GHz (Banias) */
-static struct cpufreq_frequency_table banias_1700[] =
-{
- OP( 600, 956),
- OP( 800, 1004),
- OP(1000, 1116),
- OP(1200, 1228),
- OP(1400, 1308),
- OP(1700, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-#undef OP
-
-#define _BANIAS(cpuid, max, name) \
-{ .cpu_id = cpuid, \
- .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
- .max_freq = (max)*1000, \
- .op_points = banias_##max, \
-}
-#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
-
-/* CPU models, their operating frequency range, and freq/voltage
- operating points */
-static struct cpu_model models[] =
-{
- _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
- BANIAS(1000),
- BANIAS(1100),
- BANIAS(1200),
- BANIAS(1300),
- BANIAS(1400),
- BANIAS(1500),
- BANIAS(1600),
- BANIAS(1700),
-
- /* NULL model_name is a wildcard */
- { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
- { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
- { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
- { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
- { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
-
- { NULL, }
-};
-#undef _BANIAS
-#undef BANIAS
-
-static int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
- struct cpu_model *model;
-
- for(model = models; model->cpu_id != NULL; model++)
- if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
- (model->model_name == NULL ||
- strcmp(cpu->x86_model_id, model->model_name) == 0))
- break;
-
- if (model->cpu_id == NULL) {
- /* No match at all */
- dprintk("no support for CPU model \"%s\": "
- "send /proc/cpuinfo to " MAINTAINER "\n",
- cpu->x86_model_id);
- return -ENOENT;
- }
-
- if (model->op_points == NULL) {
- /* Matched a non-match */
- dprintk("no table support for CPU model \"%s\"\n",
- cpu->x86_model_id);
- dprintk("try using the acpi-cpufreq driver\n");
- return -ENOENT;
- }
-
- per_cpu(centrino_model, policy->cpu) = model;
-
- dprintk("found \"%s\": max frequency: %dkHz\n",
- model->model_name, model->max_freq);
-
- return 0;
-}
-
-#else
-static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
- return -ENODEV;
-}
-#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
-
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
- const struct cpu_id *x)
-{
- if ((c->x86 == x->x86) &&
- (c->x86_model == x->x86_model) &&
- (c->x86_mask == x->x86_mask))
- return 1;
- return 0;
-}
-
-/* To be called only after centrino_model is initialized */
-static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
-{
- int i;
-
- /*
- * Extract clock in kHz from PERF_CTL value
- * for centrino, as some DSDTs are buggy.
- * Ideally, this can be done using the acpi_data structure.
- */
- if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
- (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
- (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
- msr = (msr >> 8) & 0xff;
- return msr * 100000;
- }
-
- if ((!per_cpu(centrino_model, cpu)) ||
- (!per_cpu(centrino_model, cpu)->op_points))
- return 0;
-
- msr &= 0xffff;
- for (i = 0;
- per_cpu(centrino_model, cpu)->op_points[i].frequency
- != CPUFREQ_TABLE_END;
- i++) {
- if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
- return per_cpu(centrino_model, cpu)->
- op_points[i].frequency;
- }
- if (failsafe)
- return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
- else
- return 0;
-}
-
-/* Return the current CPU frequency in kHz */
-static unsigned int get_cur_freq(unsigned int cpu)
-{
- unsigned l, h;
- unsigned clock_freq;
-
- rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
- clock_freq = extract_clock(l, cpu, 0);
-
- if (unlikely(clock_freq == 0)) {
- /*
- * On some CPUs, we can see transient MSR values (which are
- * not present in _PSS), while CPU is doing some automatic
- * P-state transition (like TM2). Get the last freq set
- * in PERF_CTL.
- */
- rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
- clock_freq = extract_clock(l, cpu, 1);
- }
- return clock_freq;
-}
-
-
-static int centrino_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
- unsigned freq;
- unsigned l, h;
- int ret;
- int i;
-
- /* Only Intel makes Enhanced Speedstep-capable CPUs */
- if (cpu->x86_vendor != X86_VENDOR_INTEL ||
- !cpu_has(cpu, X86_FEATURE_EST))
- return -ENODEV;
-
- if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
- centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- for (i = 0; i < N_IDS; i++)
- if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
- break;
-
- if (i != N_IDS)
- per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
-
- if (!per_cpu(centrino_cpu, policy->cpu)) {
- dprintk("found unsupported CPU with "
- "Enhanced SpeedStep: send /proc/cpuinfo to "
- MAINTAINER "\n");
- return -ENODEV;
- }
-
- if (centrino_cpu_init_table(policy)) {
- return -ENODEV;
- }
-
- /* Check to see if Enhanced SpeedStep is enabled, and try to
- enable it if not. */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
- dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
- wrmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- /* check to see if it stuck */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- printk(KERN_INFO PFX
- "couldn't enable Enhanced SpeedStep\n");
- return -ENODEV;
- }
- }
-
- freq = get_cur_freq(policy->cpu);
- policy->cpuinfo.transition_latency = 10000;
- /* 10uS transition latency */
- policy->cur = freq;
-
- dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
-
- ret = cpufreq_frequency_table_cpuinfo(policy,
- per_cpu(centrino_model, policy->cpu)->op_points);
- if (ret)
- return (ret);
-
- cpufreq_frequency_table_get_attr(
- per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
-
- return 0;
-}
-
-static int centrino_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
-
- if (!per_cpu(centrino_model, cpu))
- return -ENODEV;
-
- cpufreq_frequency_table_put_attr(cpu);
-
- per_cpu(centrino_model, cpu) = NULL;
-
- return 0;
-}
-
-/**
- * centrino_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within this model's frequency range at least one
- * border included.
- */
-static int centrino_verify (struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy,
- per_cpu(centrino_model, policy->cpu)->op_points);
-}
-
-/**
- * centrino_setpolicy - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int centrino_target (struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
- unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
- struct cpufreq_freqs freqs;
- int retval = 0;
- unsigned int j, k, first_cpu, tmp;
- cpumask_var_t covered_cpus;
-
- if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
- return -ENOMEM;
-
- if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
- retval = -ENODEV;
- goto out;
- }
-
- if (unlikely(cpufreq_frequency_table_target(policy,
- per_cpu(centrino_model, cpu)->op_points,
- target_freq,
- relation,
- &newstate))) {
- retval = -EINVAL;
- goto out;
- }
-
- first_cpu = 1;
- for_each_cpu(j, policy->cpus) {
- int good_cpu;
-
- /* cpufreq holds the hotplug lock, so we are safe here */
- if (!cpu_online(j))
- continue;
-
- /*
- * Support for SMP systems.
- * Make sure we are running on CPU that wants to change freq
- */
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- good_cpu = cpumask_any_and(policy->cpus,
- cpu_online_mask);
- else
- good_cpu = j;
-
- if (good_cpu >= nr_cpu_ids) {
- dprintk("couldn't limit to CPUs in this domain\n");
- retval = -EAGAIN;
- if (first_cpu) {
- /* We haven't started the transition yet. */
- goto out;
- }
- break;
- }
-
- msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
-
- if (first_cpu) {
- rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
- if (msr == (oldmsr & 0xffff)) {
- dprintk("no change needed - msr was and needs "
- "to be %x\n", oldmsr);
- retval = 0;
- goto out;
- }
-
- freqs.old = extract_clock(oldmsr, cpu, 0);
- freqs.new = extract_clock(msr, cpu, 0);
-
- dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
- target_freq, freqs.old, freqs.new, msr);
-
- for_each_cpu(k, policy->cpus) {
- if (!cpu_online(k))
- continue;
- freqs.cpu = k;
- cpufreq_notify_transition(&freqs,
- CPUFREQ_PRECHANGE);
- }
-
- first_cpu = 0;
- /* all but 16 LSB are reserved, treat them with care */
- oldmsr &= ~0xffff;
- msr &= 0xffff;
- oldmsr |= msr;
- }
-
- wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- break;
-
- cpumask_set_cpu(j, covered_cpus);
- }
-
- for_each_cpu(k, policy->cpus) {
- if (!cpu_online(k))
- continue;
- freqs.cpu = k;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- if (unlikely(retval)) {
- /*
- * We have failed halfway through the frequency change.
- * We have sent callbacks to policy->cpus and
- * MSRs have already been written on coverd_cpus.
- * Best effort undo..
- */
-
- for_each_cpu(j, covered_cpus)
- wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
-
- tmp = freqs.new;
- freqs.new = freqs.old;
- freqs.old = tmp;
- for_each_cpu(j, policy->cpus) {
- if (!cpu_online(j))
- continue;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- }
- retval = 0;
-
-out:
- free_cpumask_var(covered_cpus);
- return retval;
-}
-
-static struct freq_attr* centrino_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver centrino_driver = {
- .name = "centrino", /* should be speedstep-centrino,
- but there's a 16 char limit */
- .init = centrino_cpu_init,
- .exit = centrino_cpu_exit,
- .verify = centrino_verify,
- .target = centrino_target,
- .get = get_cur_freq,
- .attr = centrino_attr,
- .owner = THIS_MODULE,
-};
-
-
-/**
- * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
- *
- * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
- * unsupported devices, -ENOENT if there's no voltage table for this
- * particular CPU model, -EINVAL on problems during initiatization,
- * and zero on success.
- *
- * This is quite picky. Not only does the CPU have to advertise the
- * "est" flag in the cpuid capability flags, we look for a specific
- * CPU model and stepping, and we need to have the exact model name in
- * our voltage tables. That is, be paranoid about not releasing
- * someone's valuable magic smoke.
- */
-static int __init centrino_init(void)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(0);
-
- if (!cpu_has(cpu, X86_FEATURE_EST))
- return -ENODEV;
-
- return cpufreq_register_driver(&centrino_driver);
-}
-
-static void __exit centrino_exit(void)
-{
- cpufreq_unregister_driver(&centrino_driver);
-}
-
-MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
-MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
-MODULE_LICENSE ("GPL");
-
-late_initcall(centrino_init);
-module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 561758e9518..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * (C) 2001 Dave Jones, Arjan van de ven.
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon reverse engineered information, and on Intel documentation
- * for chipsets ICH2-M and ICH3-M.
- *
- * Many thanks to Ducrot Bruno for finding and fixing the last
- * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
- * for extensive testing.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-/*********************************************************************
- * SPEEDSTEP - DEFINITIONS *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/sched.h>
-
-#include "speedstep-lib.h"
-
-
-/* speedstep_chipset:
- * It is necessary to know which chipset is used. As accesses to
- * this device occur at various places in this module, we need a
- * static struct pci_dev * pointing to that device.
- */
-static struct pci_dev *speedstep_chipset_dev;
-
-
-/* speedstep_processor
- */
-static enum speedstep_processor speedstep_processor;
-
-static u32 pmbase;
-
-/*
- * There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
- {SPEEDSTEP_HIGH, 0},
- {SPEEDSTEP_LOW, 0},
- {0, CPUFREQ_TABLE_END},
-};
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-ich", msg)
-
-
-/**
- * speedstep_find_register - read the PMBASE address
- *
- * Returns: -ENODEV if no register could be found
- */
-static int speedstep_find_register(void)
-{
- if (!speedstep_chipset_dev)
- return -ENODEV;
-
- /* get PMBASE */
- pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
- if (!(pmbase & 0x01)) {
- printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
- return -ENODEV;
- }
-
- pmbase &= 0xFFFFFFFE;
- if (!pmbase) {
- printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
- return -ENODEV;
- }
-
- dprintk("pmbase is 0x%x\n", pmbase);
- return 0;
-}
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- * Tries to change the SpeedStep state. Can be called from
- * smp_call_function_single.
- */
-static void speedstep_set_state(unsigned int state)
-{
- u8 pm2_blk;
- u8 value;
- unsigned long flags;
-
- if (state > 0x1)
- return;
-
- /* Disable IRQs */
- local_irq_save(flags);
-
- /* read state */
- value = inb(pmbase + 0x50);
-
- dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
- /* write new state */
- value &= 0xFE;
- value |= state;
-
- dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
-
- /* Disable bus master arbitration */
- pm2_blk = inb(pmbase + 0x20);
- pm2_blk |= 0x01;
- outb(pm2_blk, (pmbase + 0x20));
-
- /* Actual transition */
- outb(value, (pmbase + 0x50));
-
- /* Restore bus master arbitration */
- pm2_blk &= 0xfe;
- outb(pm2_blk, (pmbase + 0x20));
-
- /* check if transition was successful */
- value = inb(pmbase + 0x50);
-
- /* Enable IRQs */
- local_irq_restore(flags);
-
- dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
- if (state == (value & 0x1))
- dprintk("change to %u MHz succeeded\n",
- speedstep_get_frequency(speedstep_processor) / 1000);
- else
- printk(KERN_ERR "cpufreq: change failed - I/O error\n");
-
- return;
-}
-
-/* Wrapper for smp_call_function_single. */
-static void _speedstep_set_state(void *_state)
-{
- speedstep_set_state(*(unsigned int *)_state);
-}
-
-/**
- * speedstep_activate - activate SpeedStep control in the chipset
- *
- * Tries to activate the SpeedStep status and control registers.
- * Returns -EINVAL on an unsupported chipset, and zero on success.
- */
-static int speedstep_activate(void)
-{
- u16 value = 0;
-
- if (!speedstep_chipset_dev)
- return -EINVAL;
-
- pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
- if (!(value & 0x08)) {
- value |= 0x08;
- dprintk("activating SpeedStep (TM) registers\n");
- pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
- }
-
- return 0;
-}
-
-
-/**
- * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
- *
- * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
- * the LPC bridge / PM module which contains all power-management
- * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
- * chipset, or zero on failure.
- */
-static unsigned int speedstep_detect_chipset(void)
-{
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801DB_12,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev)
- return 4; /* 4-M */
-
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801CA_12,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev)
- return 3; /* 3-M */
-
-
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801BA_10,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev) {
- /* speedstep.c causes lockups on Dell Inspirons 8000 and
- * 8100 which use a pretty old revision of the 82815
- * host brige. Abort on these systems.
- */
- static struct pci_dev *hostbridge;
-
- hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82815_MC,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
-
- if (!hostbridge)
- return 2; /* 2-M */
-
- if (hostbridge->revision < 5) {
- dprintk("hostbridge does not support speedstep\n");
- speedstep_chipset_dev = NULL;
- pci_dev_put(hostbridge);
- return 0;
- }
-
- pci_dev_put(hostbridge);
- return 2; /* 2-M */
- }
-
- return 0;
-}
-
-static void get_freq_data(void *_speed)
-{
- unsigned int *speed = _speed;
-
- *speed = speedstep_get_frequency(speedstep_processor);
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
- unsigned int speed;
-
- /* You're supposed to ensure CPU is online. */
- if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
- BUG();
-
- dprintk("detected %u kHz as current frequency\n", speed);
- return speed;
-}
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0, policy_cpu;
- struct cpufreq_freqs freqs;
- int i;
-
- if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
- freqs.old = speedstep_get(policy_cpu);
- freqs.new = speedstep_freqs[newstate].frequency;
- freqs.cpu = policy->cpu;
-
- dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
-
- /* no transition necessary */
- if (freqs.old == freqs.new)
- return 0;
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
- true);
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within speedstep_low_freq and speedstep_high_freq, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-struct get_freqs {
- struct cpufreq_policy *policy;
- int ret;
-};
-
-static void get_freqs_on_cpu(void *_get_freqs)
-{
- struct get_freqs *get_freqs = _get_freqs;
-
- get_freqs->ret =
- speedstep_get_freqs(speedstep_processor,
- &speedstep_freqs[SPEEDSTEP_LOW].frequency,
- &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
- &get_freqs->policy->cpuinfo.transition_latency,
- &speedstep_set_state);
-}
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
- int result;
- unsigned int policy_cpu, speed;
- struct get_freqs gf;
-
- /* only run on CPU to be set, or on its sibling */
-#ifdef CONFIG_SMP
- cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
- policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
-
- /* detect low and high frequency and transition latency */
- gf.policy = policy;
- smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
- if (gf.ret)
- return gf.ret;
-
- /* get current speed setting */
- speed = speedstep_get(policy_cpu);
- if (!speed)
- return -EIO;
-
- dprintk("currently at %s speed setting - %i MHz\n",
- (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
- ? "low" : "high",
- (speed / 1000));
-
- /* cpuinfo and default policy values */
- policy->cur = speed;
-
- result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
- return 0;
-}
-
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static struct freq_attr *speedstep_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver speedstep_driver = {
- .name = "speedstep-ich",
- .verify = speedstep_verify,
- .target = speedstep_target,
- .init = speedstep_cpu_init,
- .exit = speedstep_cpu_exit,
- .get = speedstep_get,
- .owner = THIS_MODULE,
- .attr = speedstep_attr,
-};
-
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- * Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initiatization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
- /* detect processor */
- speedstep_processor = speedstep_detect_processor();
- if (!speedstep_processor) {
- dprintk("Intel(R) SpeedStep(TM) capable processor "
- "not found\n");
- return -ENODEV;
- }
-
- /* detect chipset */
- if (!speedstep_detect_chipset()) {
- dprintk("Intel(R) SpeedStep(TM) for this chipset not "
- "(yet) available.\n");
- return -ENODEV;
- }
-
- /* activate speedstep support */
- if (speedstep_activate()) {
- pci_dev_put(speedstep_chipset_dev);
- return -EINVAL;
- }
-
- if (speedstep_find_register())
- return -ENODEV;
-
- return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- * Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
- pci_dev_put(speedstep_chipset_dev);
- cpufreq_unregister_driver(&speedstep_driver);
-}
-
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
- "Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
- "with ICH-M southbridges.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index a94ec6be69f..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-#include "speedstep-lib.h"
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-lib", msg)
-
-#define PFX "speedstep-lib: "
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-static int relaxed_check;
-#else
-#define relaxed_check 0
-#endif
-
-/*********************************************************************
- * GET PROCESSOR CORE SPEED IN KHZ *
- *********************************************************************/
-
-static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
-{
- /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
- struct {
- unsigned int ratio; /* Frequency Multiplier (x10) */
- u8 bitmap; /* power on configuration bits
- [27, 25:22] (in MSR 0x2a) */
- } msr_decode_mult[] = {
- { 30, 0x01 },
- { 35, 0x05 },
- { 40, 0x02 },
- { 45, 0x06 },
- { 50, 0x00 },
- { 55, 0x04 },
- { 60, 0x0b },
- { 65, 0x0f },
- { 70, 0x09 },
- { 75, 0x0d },
- { 80, 0x0a },
- { 85, 0x26 },
- { 90, 0x20 },
- { 100, 0x2b },
- { 0, 0xff } /* error or unknown value */
- };
-
- /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
- struct {
- unsigned int value; /* Front Side Bus speed in MHz */
- u8 bitmap; /* power on configuration bits [18: 19]
- (in MSR 0x2a) */
- } msr_decode_fsb[] = {
- { 66, 0x0 },
- { 100, 0x2 },
- { 133, 0x1 },
- { 0, 0xff}
- };
-
- u32 msr_lo, msr_tmp;
- int i = 0, j = 0;
-
- /* read MSR 0x2a - we only need the low 32 bits */
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
- msr_tmp = msr_lo;
-
- /* decode the FSB */
- msr_tmp &= 0x00c0000;
- msr_tmp >>= 18;
- while (msr_tmp != msr_decode_fsb[i].bitmap) {
- if (msr_decode_fsb[i].bitmap == 0xff)
- return 0;
- i++;
- }
-
- /* decode the multiplier */
- if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
- dprintk("workaround for early PIIIs\n");
- msr_lo &= 0x03c00000;
- } else
- msr_lo &= 0x0bc00000;
- msr_lo >>= 22;
- while (msr_lo != msr_decode_mult[j].bitmap) {
- if (msr_decode_mult[j].bitmap == 0xff)
- return 0;
- j++;
- }
-
- dprintk("speed is %u\n",
- (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
-
- return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
-}
-
-
-static unsigned int pentiumM_get_frequency(void)
-{
- u32 msr_lo, msr_tmp;
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
-
- /* see table B-2 of 24547212.pdf */
- if (msr_lo & 0x00040000) {
- printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
- msr_lo, msr_tmp);
- return 0;
- }
-
- msr_tmp = (msr_lo >> 22) & 0x1f;
- dprintk("bits 22-26 are 0x%x, speed is %u\n",
- msr_tmp, (msr_tmp * 100 * 1000));
-
- return msr_tmp * 100 * 1000;
-}
-
-static unsigned int pentium_core_get_frequency(void)
-{
- u32 fsb = 0;
- u32 msr_lo, msr_tmp;
- int ret;
-
- rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
- /* see table B-2 of 25366920.pdf */
- switch (msr_lo & 0x07) {
- case 5:
- fsb = 100000;
- break;
- case 1:
- fsb = 133333;
- break;
- case 3:
- fsb = 166667;
- break;
- case 2:
- fsb = 200000;
- break;
- case 0:
- fsb = 266667;
- break;
- case 4:
- fsb = 333333;
- break;
- default:
- printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
- }
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
- msr_lo, msr_tmp);
-
- msr_tmp = (msr_lo >> 22) & 0x1f;
- dprintk("bits 22-26 are 0x%x, speed is %u\n",
- msr_tmp, (msr_tmp * fsb));
-
- ret = (msr_tmp * fsb);
- return ret;
-}
-
-
-static unsigned int pentium4_get_frequency(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- u32 msr_lo, msr_hi, mult;
- unsigned int fsb = 0;
- unsigned int ret;
- u8 fsb_code;
-
- /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
- * to System Bus Frequency Ratio Field in the Processor Frequency
- * Configuration Register of the MSR. Therefore the current
- * frequency cannot be calculated and has to be measured.
- */
- if (c->x86_model < 2)
- return cpu_khz;
-
- rdmsr(0x2c, msr_lo, msr_hi);
-
- dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
-
- /* decode the FSB: see IA-32 Intel (C) Architecture Software
- * Developer's Manual, Volume 3: System Prgramming Guide,
- * revision #12 in Table B-1: MSRs in the Pentium 4 and
- * Intel Xeon Processors, on page B-4 and B-5.
- */
- fsb_code = (msr_lo >> 16) & 0x7;
- switch (fsb_code) {
- case 0:
- fsb = 100 * 1000;
- break;
- case 1:
- fsb = 13333 * 10;
- break;
- case 2:
- fsb = 200 * 1000;
- break;
- }
-
- if (!fsb)
- printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
- "Please send an e-mail to <linux@brodo.de>\n");
-
- /* Multiplier. */
- mult = msr_lo >> 24;
-
- dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
- fsb, mult, (fsb * mult));
-
- ret = (fsb * mult);
- return ret;
-}
-
-
-/* Warning: may get called from smp_call_function_single. */
-unsigned int speedstep_get_frequency(enum speedstep_processor processor)
-{
- switch (processor) {
- case SPEEDSTEP_CPU_PCORE:
- return pentium_core_get_frequency();
- case SPEEDSTEP_CPU_PM:
- return pentiumM_get_frequency();
- case SPEEDSTEP_CPU_P4D:
- case SPEEDSTEP_CPU_P4M:
- return pentium4_get_frequency();
- case SPEEDSTEP_CPU_PIII_T:
- case SPEEDSTEP_CPU_PIII_C:
- case SPEEDSTEP_CPU_PIII_C_EARLY:
- return pentium3_get_frequency(processor);
- default:
- return 0;
- };
- return 0;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_frequency);
-
-
-/*********************************************************************
- * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
- *********************************************************************/
-
-unsigned int speedstep_detect_processor(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- u32 ebx, msr_lo, msr_hi;
-
- dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
-
- if ((c->x86_vendor != X86_VENDOR_INTEL) ||
- ((c->x86 != 6) && (c->x86 != 0xF)))
- return 0;
-
- if (c->x86 == 0xF) {
- /* Intel Mobile Pentium 4-M
- * or Intel Mobile Pentium 4 with 533 MHz FSB */
- if (c->x86_model != 2)
- return 0;
-
- ebx = cpuid_ebx(0x00000001);
- ebx &= 0x000000FF;
-
- dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
-
- switch (c->x86_mask) {
- case 4:
- /*
- * B-stepping [M-P4-M]
- * sample has ebx = 0x0f, production has 0x0e.
- */
- if ((ebx == 0x0e) || (ebx == 0x0f))
- return SPEEDSTEP_CPU_P4M;
- break;
- case 7:
- /*
- * C-stepping [M-P4-M]
- * needs to have ebx=0x0e, else it's a celeron:
- * cf. 25130917.pdf / page 7, footnote 5 even
- * though 25072120.pdf / page 7 doesn't say
- * samples are only of B-stepping...
- */
- if (ebx == 0x0e)
- return SPEEDSTEP_CPU_P4M;
- break;
- case 9:
- /*
- * D-stepping [M-P4-M or M-P4/533]
- *
- * this is totally strange: CPUID 0x0F29 is
- * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
- * The latter need to be sorted out as they don't
- * support speedstep.
- * Celerons with CPUID 0x0F29 may have either
- * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
- * specific.
- * M-P4-Ms may have either ebx=0xe or 0xf [see above]
- * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
- * also, M-P4M HTs have ebx=0x8, too
- * For now, they are distinguished by the model_id
- * string
- */
- if ((ebx == 0x0e) ||
- (strstr(c->x86_model_id,
- "Mobile Intel(R) Pentium(R) 4") != NULL))
- return SPEEDSTEP_CPU_P4M;
- break;
- default:
- break;
- }
- return 0;
- }
-
- switch (c->x86_model) {
- case 0x0B: /* Intel PIII [Tualatin] */
- /* cpuid_ebx(1) is 0x04 for desktop PIII,
- * 0x06 for mobile PIII-M */
- ebx = cpuid_ebx(0x00000001);
- dprintk("ebx is %x\n", ebx);
-
- ebx &= 0x000000FF;
-
- if (ebx != 0x06)
- return 0;
-
- /* So far all PIII-M processors support SpeedStep. See
- * Intel's 24540640.pdf of June 2003
- */
- return SPEEDSTEP_CPU_PIII_T;
-
- case 0x08: /* Intel PIII [Coppermine] */
-
- /* all mobile PIII Coppermines have FSB 100 MHz
- * ==> sort out a few desktop PIIIs. */
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
- dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
- msr_lo, msr_hi);
- msr_lo &= 0x00c0000;
- if (msr_lo != 0x0080000)
- return 0;
-
- /*
- * If the processor is a mobile version,
- * platform ID has bit 50 set
- * it has SpeedStep technology if either
- * bit 56 or 57 is set
- */
- rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
- dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
- msr_lo, msr_hi);
- if ((msr_hi & (1<<18)) &&
- (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
- if (c->x86_mask == 0x01) {
- dprintk("early PIII version\n");
- return SPEEDSTEP_CPU_PIII_C_EARLY;
- } else
- return SPEEDSTEP_CPU_PIII_C;
- }
-
- default:
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(speedstep_detect_processor);
-
-
-/*********************************************************************
- * DETECT SPEEDSTEP SPEEDS *
- *********************************************************************/
-
-unsigned int speedstep_get_freqs(enum speedstep_processor processor,
- unsigned int *low_speed,
- unsigned int *high_speed,
- unsigned int *transition_latency,
- void (*set_state) (unsigned int state))
-{
- unsigned int prev_speed;
- unsigned int ret = 0;
- unsigned long flags;
- struct timeval tv1, tv2;
-
- if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
- return -EINVAL;
-
- dprintk("trying to determine both speeds\n");
-
- /* get current speed */
- prev_speed = speedstep_get_frequency(processor);
- if (!prev_speed)
- return -EIO;
-
- dprintk("previous speed is %u\n", prev_speed);
-
- local_irq_save(flags);
-
- /* switch to low state */
- set_state(SPEEDSTEP_LOW);
- *low_speed = speedstep_get_frequency(processor);
- if (!*low_speed) {
- ret = -EIO;
- goto out;
- }
-
- dprintk("low speed is %u\n", *low_speed);
-
- /* start latency measurement */
- if (transition_latency)
- do_gettimeofday(&tv1);
-
- /* switch to high state */
- set_state(SPEEDSTEP_HIGH);
-
- /* end latency measurement */
- if (transition_latency)
- do_gettimeofday(&tv2);
-
- *high_speed = speedstep_get_frequency(processor);
- if (!*high_speed) {
- ret = -EIO;
- goto out;
- }
-
- dprintk("high speed is %u\n", *high_speed);
-
- if (*low_speed == *high_speed) {
- ret = -ENODEV;
- goto out;
- }
-
- /* switch to previous state, if necessary */
- if (*high_speed != prev_speed)
- set_state(SPEEDSTEP_LOW);
-
- if (transition_latency) {
- *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
- tv2.tv_usec - tv1.tv_usec;
- dprintk("transition latency is %u uSec\n", *transition_latency);
-
- /* convert uSec to nSec and add 20% for safety reasons */
- *transition_latency *= 1200;
-
- /* check if the latency measurement is too high or too low
- * and set it to a safe value (500uSec) in that case
- */
- if (*transition_latency > 10000000 ||
- *transition_latency < 50000) {
- printk(KERN_WARNING PFX "frequency transition "
- "measured seems out of range (%u "
- "nSec), falling back to a safe one of"
- "%u nSec.\n",
- *transition_latency, 500000);
- *transition_latency = 500000;
- }
- }
-
-out:
- local_irq_restore(flags);
- return ret;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_freqs);
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-module_param(relaxed_check, int, 0444);
-MODULE_PARM_DESC(relaxed_check,
- "Don't do all checks for speedstep capability.");
-#endif
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 70d9cea1219..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-
-/* processors */
-enum speedstep_processor {
- SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
- SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
- SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
- SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
-/* the following processors are not speedstep-capable and are not auto-detected
- * in speedstep_detect_processor(). However, their speed can be detected using
- * the speedstep_get_frequency() call. */
- SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
- SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
- SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
-};
-
-/* speedstep states -- only two of them */
-
-#define SPEEDSTEP_HIGH 0x00000000
-#define SPEEDSTEP_LOW 0x00000001
-
-
-/* detect a speedstep-capable processor */
-extern enum speedstep_processor speedstep_detect_processor(void);
-
-/* detect the current speed (in khz) of the processor */
-extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
-
-
-/* detect the low and high speeds of the processor. The callback
- * set_state"'s first argument is either SPEEDSTEP_HIGH or
- * SPEEDSTEP_LOW; the second argument is zero so that no
- * cpufreq_notify_transition calls are initiated.
- */
-extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
- unsigned int *low_speed,
- unsigned int *high_speed,
- unsigned int *transition_latency,
- void (*set_state) (unsigned int state));
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index 91bc25b67bc..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Intel SpeedStep SMI driver.
- *
- * (C) 2003 Hiroshi Miura <miura@da-cha.org>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- */
-
-
-/*********************************************************************
- * SPEEDSTEP - DEFINITIONS *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <asm/ist.h>
-
-#include "speedstep-lib.h"
-
-/* speedstep system management interface port/command.
- *
- * These parameters are got from IST-SMI BIOS call.
- * If user gives it, these are used.
- *
- */
-static int smi_port;
-static int smi_cmd;
-static unsigned int smi_sig;
-
-/* info about the processor */
-static enum speedstep_processor speedstep_processor;
-
-/*
- * There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
- {SPEEDSTEP_HIGH, 0},
- {SPEEDSTEP_LOW, 0},
- {0, CPUFREQ_TABLE_END},
-};
-
-#define GET_SPEEDSTEP_OWNER 0
-#define GET_SPEEDSTEP_STATE 1
-#define SET_SPEEDSTEP_STATE 2
-#define GET_SPEEDSTEP_FREQS 4
-
-/* how often shall the SMI call be tried if it failed, e.g. because
- * of DMA activity going on? */
-#define SMI_TRIES 5
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-smi", msg)
-
-/**
- * speedstep_smi_ownership
- */
-static int speedstep_smi_ownership(void)
-{
- u32 command, result, magic, dummy;
- u32 function = GET_SPEEDSTEP_OWNER;
- unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
- magic = virt_to_phys(magic_data);
-
- dprintk("trying to obtain ownership with command %x at port %x\n",
- command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp\n"
- : "=D" (result),
- "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
- "=S" (dummy)
- : "a" (command), "b" (function), "c" (0), "d" (smi_port),
- "D" (0), "S" (magic)
- : "memory"
- );
-
- dprintk("result is %x\n", result);
-
- return result;
-}
-
-/**
- * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
- * @low: the low frequency value is placed here
- * @high: the high frequency value is placed here
- *
- * Only available on later SpeedStep-enabled systems, returns false results or
- * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
- * shows that the latter occurs if !(ist_info.event & 0xFFFF).
- */
-static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
-{
- u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
- u32 state = 0;
- u32 function = GET_SPEEDSTEP_FREQS;
-
- if (!(ist_info.event & 0xFFFF)) {
- dprintk("bug #1422 -- can't read freqs from BIOS\n");
- return -ENODEV;
- }
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to determine frequencies with command %x at port %x\n",
- command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp"
- : "=a" (result),
- "=b" (high_mhz),
- "=c" (low_mhz),
- "=d" (state), "=D" (edi), "=S" (dummy)
- : "a" (command),
- "b" (function),
- "c" (state),
- "d" (smi_port), "S" (0), "D" (0)
- );
-
- dprintk("result %x, low_freq %u, high_freq %u\n",
- result, low_mhz, high_mhz);
-
- /* abort if results are obviously incorrect... */
- if ((high_mhz + low_mhz) < 600)
- return -EINVAL;
-
- *high = high_mhz * 1000;
- *low = low_mhz * 1000;
-
- return result;
-}
-
-/**
- * speedstep_get_state - set the SpeedStep state
- * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- */
-static int speedstep_get_state(void)
-{
- u32 function = GET_SPEEDSTEP_STATE;
- u32 result, state, edi, command, dummy;
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to determine current setting with command %x "
- "at port %x\n", command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp\n"
- : "=a" (result),
- "=b" (state), "=D" (edi),
- "=c" (dummy), "=d" (dummy), "=S" (dummy)
- : "a" (command), "b" (function), "c" (0),
- "d" (smi_port), "S" (0), "D" (0)
- );
-
- dprintk("state is %x, result is %x\n", state, result);
-
- return state & 1;
-}
-
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- */
-static void speedstep_set_state(unsigned int state)
-{
- unsigned int result = 0, command, new_state, dummy;
- unsigned long flags;
- unsigned int function = SET_SPEEDSTEP_STATE;
- unsigned int retry = 0;
-
- if (state > 0x1)
- return;
-
- /* Disable IRQs */
- local_irq_save(flags);
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to set frequency to state %u "
- "with command %x at port %x\n",
- state, command, smi_port);
-
- do {
- if (retry) {
- dprintk("retry %u, previous result %u, waiting...\n",
- retry, result);
- mdelay(retry * 50);
- }
- retry++;
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp"
- : "=b" (new_state), "=D" (result),
- "=c" (dummy), "=a" (dummy),
- "=d" (dummy), "=S" (dummy)
- : "a" (command), "b" (function), "c" (state),
- "d" (smi_port), "S" (0), "D" (0)
- );
- } while ((new_state != state) && (retry <= SMI_TRIES));
-
- /* enable IRQs */
- local_irq_restore(flags);
-
- if (new_state == state)
- dprintk("change to %u MHz succeeded after %u tries "
- "with result %u\n",
- (speedstep_freqs[new_state].frequency / 1000),
- retry, result);
- else
- printk(KERN_ERR "cpufreq: change to state %u "
- "failed with new_state %u and result %u\n",
- state, new_state, result);
-
- return;
-}
-
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: new freq
- * @relation:
- *
- * Sets a new CPUFreq policy/freq.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- unsigned int newstate = 0;
- struct cpufreq_freqs freqs;
-
- if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
- freqs.new = speedstep_freqs[newstate].frequency;
- freqs.cpu = 0; /* speedstep.c is UP only driver */
-
- if (freqs.old == freqs.new)
- return 0;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- speedstep_set_state(newstate);
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within speedstep_low_freq and speedstep_high_freq, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
- int result;
- unsigned int speed, state;
- unsigned int *low, *high;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- result = speedstep_smi_ownership();
- if (result) {
- dprintk("fails in acquiring ownership of a SMI interface.\n");
- return -EINVAL;
- }
-
- /* detect low and high frequency */
- low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
- high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
-
- result = speedstep_smi_get_freqs(low, high);
- if (result) {
- /* fall back to speedstep_lib.c dection mechanism:
- * try both states out */
- dprintk("could not detect low and high frequencies "
- "by SMI call.\n");
- result = speedstep_get_freqs(speedstep_processor,
- low, high,
- NULL,
- &speedstep_set_state);
-
- if (result) {
- dprintk("could not detect two different speeds"
- " -- aborting.\n");
- return result;
- } else
- dprintk("workaround worked.\n");
- }
-
- /* get current speed setting */
- state = speedstep_get_state();
- speed = speedstep_freqs[state].frequency;
-
- dprintk("currently at %s speed setting - %i MHz\n",
- (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
- ? "low" : "high",
- (speed / 1000));
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = speed;
-
- result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
- return 0;
-}
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
- if (cpu)
- return -ENODEV;
- return speedstep_get_frequency(speedstep_processor);
-}
-
-
-static int speedstep_resume(struct cpufreq_policy *policy)
-{
- int result = speedstep_smi_ownership();
-
- if (result)
- dprintk("fails in re-acquiring ownership of a SMI interface.\n");
-
- return result;
-}
-
-static struct freq_attr *speedstep_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver speedstep_driver = {
- .name = "speedstep-smi",
- .verify = speedstep_verify,
- .target = speedstep_target,
- .init = speedstep_cpu_init,
- .exit = speedstep_cpu_exit,
- .get = speedstep_get,
- .resume = speedstep_resume,
- .owner = THIS_MODULE,
- .attr = speedstep_attr,
-};
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- * Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * BIOS, -EINVAL on problems during initiatization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
- speedstep_processor = speedstep_detect_processor();
-
- switch (speedstep_processor) {
- case SPEEDSTEP_CPU_PIII_T:
- case SPEEDSTEP_CPU_PIII_C:
- case SPEEDSTEP_CPU_PIII_C_EARLY:
- break;
- default:
- speedstep_processor = 0;
- }
-
- if (!speedstep_processor) {
- dprintk("No supported Intel CPU detected.\n");
- return -ENODEV;
- }
-
- dprintk("signature:0x%.8lx, command:0x%.8lx, "
- "event:0x%.8lx, perf_level:0x%.8lx.\n",
- ist_info.signature, ist_info.command,
- ist_info.event, ist_info.perf_level);
-
- /* Error if no IST-SMI BIOS or no PARM
- sig= 'ISGE' aka 'Intel Speedstep Gate E' */
- if ((ist_info.signature != 0x47534943) && (
- (smi_port == 0) || (smi_cmd == 0)))
- return -ENODEV;
-
- if (smi_sig == 1)
- smi_sig = 0x47534943;
- else
- smi_sig = ist_info.signature;
-
- /* setup smi_port from MODLULE_PARM or BIOS */
- if ((smi_port > 0xff) || (smi_port < 0))
- return -EINVAL;
- else if (smi_port == 0)
- smi_port = ist_info.command & 0xff;
-
- if ((smi_cmd > 0xff) || (smi_cmd < 0))
- return -EINVAL;
- else if (smi_cmd == 0)
- smi_cmd = (ist_info.command >> 16) & 0xff;
-
- return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- * Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
- cpufreq_unregister_driver(&speedstep_driver);
-}
-
-module_param(smi_port, int, 0444);
-module_param(smi_cmd, int, 0444);
-module_param(smi_sig, uint, 0444);
-
-MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
- "-- Intel's default setting is 0xb2");
-MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
- "-- Intel's default setting is 0x82");
-MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
- "SMI interface.");
-
-MODULE_AUTHOR("Hiroshi Miura");
-MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 8095f8611f8..755f64fb074 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -32,11 +32,11 @@
*/
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
- &x86_hyper_vmware,
- &x86_hyper_ms_hyperv,
#ifdef CONFIG_XEN_PVHVM
&x86_hyper_xen_hvm,
#endif
+ &x86_hyper_vmware,
+ &x86_hyper_ms_hyperv,
};
const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index df86bc8c859..ed6086eedf1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
+ u64 misc_enable;
+
/* Unmask CPUID levels if masked: */
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- u64 misc_enable;
-
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* (model 2) with the same problem.
*/
if (c->x86 == 15) {
- u64 misc_enable;
-
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
}
}
#endif
+
+ /*
+ * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
+ * clear the fast string and enhanced fast string CPU capabilities.
+ */
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ printk(KERN_INFO "Disabled fast string operations\n");
+ setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+ setup_clear_cpu_cap(X86_FEATURE_ERMS);
+ }
+ }
}
#ifdef CONFIG_X86_32
@@ -400,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case 5:
- if (c->x86_mask == 0) {
- if (l2 == 0)
- p = "Celeron (Covington)";
- else if (l2 == 256)
- p = "Mobile Pentium II (Dixon)";
- }
+ if (l2 == 0)
+ p = "Celeron (Covington)";
+ else if (l2 == 256)
+ p = "Mobile Pentium II (Dixon)";
break;
case 6:
@@ -447,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
+
+ /*
+ * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
+ * x86_energy_perf_policy(8) is available to change it at run-time
+ */
+ if (cpu_has(c, X86_FEATURE_EPB)) {
+ u64 epb;
+
+ rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
+ printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
+ " Set to 'normal', was 'performance'\n"
+ "ENERGY_PERF_BIAS: View and update with"
+ " x86_energy_perf_policy(8)\n");
+ epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+ wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ }
+ }
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1ce1af2899d..c105c533ed9 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -327,7 +327,6 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
- l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
@@ -454,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
{
int ret = 0;
-#define SUBCACHE_MASK (3UL << 20)
-#define SUBCACHE_INDEX 0xfff
-
- /*
- * check whether this slot is already used or
- * the index is already disabled
- */
+ /* check if @slot is already used or the index is already disabled */
ret = amd_get_l3_disable_slot(l3, slot);
if (ret >= 0)
return -EINVAL;
- /*
- * check whether the other slot has disabled the
- * same index already
- */
- if (index == amd_get_l3_disable_slot(l3, !slot))
+ if (index > l3->indices)
return -EINVAL;
- /* do not allow writes outside of allowed bits */
- if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
- ((index & SUBCACHE_INDEX) > l3->indices))
+ /* check whether the other slot has disabled the same index already */
+ if (index == amd_get_l3_disable_slot(l3, !slot))
return -EINVAL;
amd_l3_disable_index(l3, cpu, slot, index);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336..7395d5f4272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
unsigned char covered;
char *msg;
} severities[] = {
-#define KERNEL .context = IN_KERNEL
-#define USER .context = IN_USER
-#define SER .ser = SER_REQUIRED
-#define NOSER .ser = NO_SER
-#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
-#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
-#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
-#define MCGMASK(x, res, s, m, r...) \
- { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
-#define MASK(x, y, s, m, r...) \
- { .mask = x, .result = y, SEV(s), .msg = m, ## r }
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define BITCLR(x) .mask = x, .result = 0
+#define BITSET(x) .mask = x, .result = x
+#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
+#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCACOD 0xffff
- BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
- BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
- BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
+ MCESEV(
+ NO, "Invalid",
+ BITCLR(MCI_STATUS_VAL)
+ ),
+ MCESEV(
+ NO, "Not enabled",
+ BITCLR(MCI_STATUS_EN)
+ ),
+ MCESEV(
+ PANIC, "Processor context corrupt",
+ BITSET(MCI_STATUS_PCC)
+ ),
/* When MCIP is not set something is very confused */
- MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
+ MCESEV(
+ PANIC, "MCIP not set in MCA handler",
+ MCGMASK(MCG_STATUS_MCIP, 0)
+ ),
/* Neither return not error IP -- no chance to recover -> PANIC */
- MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
- "Neither restart nor error IP"),
- MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
- KERNEL),
- BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
- MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
- "Spurious not enabled", SER),
+ MCESEV(
+ PANIC, "Neither restart nor error IP",
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ KEEP, "Corrected error",
+ NOSER, BITCLR(MCI_STATUS_UC)
+ ),
/* ignore OVER for UCNA */
- MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
- "Uncorrected no action required", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
- "Illegal combination (UCNA with AR=1)", SER),
- MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
+ MCESEV(
+ KEEP, "Uncorrected no action required",
+ SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Illegal combination (UCNA with AR=1)",
+ SER,
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+ ),
+ MCESEV(
+ KEEP, "Non signalled machine check",
+ SER, BITCLR(MCI_STATUS_S)
+ ),
/* AR add known MCACODs here */
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
- "Action required with lost events", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
- "Action required; unknown MCACOD", SER),
+ MCESEV(
+ PANIC, "Action required with lost events",
+ SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+ ),
+ MCESEV(
+ PANIC, "Action required: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+ ),
/* known AO MCACODs: */
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
- "Action optional: memory scrubbing error", SER),
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
- "Action optional: last level cache writeback error", SER),
-
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
- "Action optional unknown MCACOD", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
- "Action optional with lost events", SER),
- BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
- BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
- BITSET(0, SOME, "No match") /* always matches. keep at end */
+ MCESEV(
+ AO, "Action optional: memory scrubbing error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+ ),
+ MCESEV(
+ AO, "Action optional: last level cache writeback error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+ ),
+ MCESEV(
+ SOME, "Action optional: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+ ),
+ MCESEV(
+ SOME, "Action optional with lost events",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+ ),
+
+ MCESEV(
+ PANIC, "Overflowed uncorrected",
+ BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+ ),
+ MCESEV(
+ UC, "Uncorrected",
+ BITSET(MCI_STATUS_UC)
+ ),
+ MCESEV(
+ SOME, "No match",
+ BITSET(0)
+ ) /* always matches. keep at end */
};
/*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
return IN_KERNEL;
}
-int mce_severity(struct mce *a, int tolerant, char **msg)
+int mce_severity(struct mce *m, int tolerant, char **msg)
{
- enum context ctx = error_context(a);
+ enum context ctx = error_context(m);
struct severity *s;
for (s = severities;; s++) {
- if ((a->status & s->mask) != s->result)
+ if ((m->status & s->mask) != s->result)
continue;
- if ((a->mcgstatus & s->mcgmask) != s->mcgres)
+ if ((m->mcgstatus & s->mcgmask) != s->mcgres)
continue;
if (s->ser == SER_REQUIRED && !mce_ser)
continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
static int __init severities_debugfs_init(void)
{
- struct dentry *dmce = NULL, *fseverities_coverage = NULL;
+ struct dentry *dmce, *fsev;
dmce = mce_get_debugfs_dir();
- if (dmce == NULL)
+ if (!dmce)
goto err_out;
- fseverities_coverage = debugfs_create_file("severities-coverage",
- 0444, dmce, NULL,
- &severities_coverage_fops);
- if (fseverities_coverage == NULL)
+
+ fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+ &severities_coverage_fops);
+ if (!fsev)
goto err_out;
return 0;
@@ -214,4 +258,4 @@ err_out:
return -ENOMEM;
}
late_initcall(severities_debugfs_init);
-#endif
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3385ea26f68..08363b04212 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
-#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/edac_mce.h>
+#include <linux/irq_work.h>
#include <asm/processor.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include "mce-internal.h"
-static DEFINE_MUTEX(mce_read_mutex);
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define rcu_dereference_check_mce(p) \
rcu_dereference_index_check((p), \
rcu_read_lock_sched_held() || \
- lockdep_is_held(&mce_read_mutex))
+ lockdep_is_held(&mce_chrdev_read_mutex))
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
@@ -105,20 +102,6 @@ static int cpu_missing;
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
-static int default_decode_mce(struct notifier_block *nb, unsigned long val,
- void *data)
-{
- pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
- pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
-
- return NOTIFY_STOP;
-}
-
-static struct notifier_block mce_dec_nb = {
- .notifier_call = default_decode_mce,
- .priority = -1,
-};
-
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -212,6 +195,8 @@ void mce_log(struct mce *mce)
static void print_mce(struct mce *m)
{
+ int ret = 0;
+
pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
@@ -239,7 +224,11 @@ static void print_mce(struct mce *m)
* Print out human-readable details about the MCE error,
* (if the CPU has an implementation for that)
*/
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ if (ret == NOTIFY_STOP)
+ return;
+
+ pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -381,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
}
/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+{
+ mce_setup(m);
+
+ m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (regs) {
+ /*
+ * Get the address of the instruction at the time of
+ * the machine check error.
+ */
+ if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+ }
+ /* Use accurate RIP reporting if available. */
+ if (rip_msr)
+ m->ip = mce_rdmsrl(rip_msr);
+ }
+}
+
+/*
* Simple lockless ring to communicate PFNs from the exception handler with the
* process context work function. This is vastly simplified because there's
* only a single reader and a single writer.
@@ -451,40 +465,13 @@ static void mce_schedule_work(void)
}
}
-/*
- * Get the address of the instruction at the time of the machine check
- * error.
- */
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
+DEFINE_PER_CPU(struct irq_work, mce_irq_work);
- if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
- m->ip = regs->ip;
- m->cs = regs->cs;
- } else {
- m->ip = 0;
- m->cs = 0;
- }
- if (rip_msr)
- m->ip = mce_rdmsrl(rip_msr);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static void mce_irq_work_cb(struct irq_work *entry)
{
- ack_APIC_irq();
- exit_idle();
- irq_enter();
mce_notify_irq();
mce_schedule_work();
- irq_exit();
}
-#endif
static void mce_report_event(struct pt_regs *regs)
{
@@ -500,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
return;
}
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Without APIC do not notify. The event will be picked
- * up eventually.
- */
- if (!cpu_has_apic)
- return;
-
- /*
- * When interrupts are disabled we cannot use
- * kernel services safely. Trigger an self interrupt
- * through the APIC to instead do the notification
- * after interrupts are reenabled again.
- */
- apic->send_IPI_self(MCE_SELF_VECTOR);
-
- /*
- * Wait for idle afterwards again so that we don't leave the
- * APIC in a non idle state because the normal APIC writes
- * cannot exclude us.
- */
- apic_wait_icr_idle();
-#endif
+ irq_work_queue(&__get_cpu_var(mce_irq_work));
}
DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -549,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
percpu_inc(mce_poll_count);
- mce_setup(&m);
+ mce_gather_info(&m, NULL);
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
for (i = 0; i < banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -590,7 +554,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
mce_log(&m);
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
- add_taint(TAINT_MACHINE_CHECK);
}
/*
@@ -888,9 +851,9 @@ static int mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
return 0;
- if ((m->misc & 0x3f) > PAGE_SHIFT)
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return 0;
- if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return 0;
return 1;
}
@@ -951,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (!banks)
goto out;
- mce_setup(&m);
+ mce_gather_info(&m, regs);
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
final = &__get_cpu_var(mces_seen);
*final = m;
@@ -1037,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
mce_ring_add(m.addr >> PAGE_SHIFT);
- mce_get_rip(&m, regs);
mce_log(&m);
if (severity > worst) {
@@ -1199,7 +1160,8 @@ int mce_notify_irq(void)
clear_thread_flag(TIF_MCE_NOTIFY);
if (test_and_clear_bit(0, &mce_need_notify)) {
- wake_up_interruptible(&mce_wait);
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
/*
* There is no risk of missing notifications because
@@ -1372,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
return 0;
}
-static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
- return;
+ return 0;
+
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
+ return 1;
break;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
+ return 1;
break;
}
+
+ return 0;
}
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1437,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
if (mce_disabled)
return;
- __mcheck_cpu_ancient_init(c);
+ if (__mcheck_cpu_ancient_init(c))
+ return;
if (!mce_available(c))
return;
@@ -1453,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_timer();
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
-
+ init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}
/*
- * Character device to read and clear the MCE log.
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
*/
-static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count; /* #times opened */
-static int open_exclu; /* already open exclusive? */
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
-static int mce_open(struct inode *inode, struct file *file)
+static int mce_chrdev_open(struct inode *inode, struct file *file)
{
- spin_lock(&mce_state_lock);
+ spin_lock(&mce_chrdev_state_lock);
- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_state_lock);
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
return -EBUSY;
}
if (file->f_flags & O_EXCL)
- open_exclu = 1;
- open_count++;
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
- spin_unlock(&mce_state_lock);
+ spin_unlock(&mce_chrdev_state_lock);
return nonseekable_open(inode, file);
}
-static int mce_release(struct inode *inode, struct file *file)
+static int mce_chrdev_release(struct inode *inode, struct file *file)
{
- spin_lock(&mce_state_lock);
+ spin_lock(&mce_chrdev_state_lock);
- open_count--;
- open_exclu = 0;
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
- spin_unlock(&mce_state_lock);
+ spin_unlock(&mce_chrdev_state_lock);
return 0;
}
@@ -1539,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
return 0;
}
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
{
char __user *buf = ubuf;
unsigned long *cpu_tsc;
@@ -1551,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
if (!cpu_tsc)
return -ENOMEM;
- mutex_lock(&mce_read_mutex);
+ mutex_lock(&mce_chrdev_read_mutex);
if (!mce_apei_read_done) {
err = __mce_read_apei(&buf, usize);
@@ -1571,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
do {
for (i = prev; i < next; i++) {
unsigned long start = jiffies;
+ struct mce *m = &mcelog.entry[i];
- while (!mcelog.entry[i].finished) {
+ while (!m->finished) {
if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
- sizeof(struct mce));
+ memset(m, 0, sizeof(*m));
goto timeout;
}
cpu_relax();
}
smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
timeout:
;
}
@@ -1603,13 +1571,13 @@ timeout:
on_each_cpu(collect_tscs, cpu_tsc, 1);
for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
+ struct mce *m = &mcelog.entry[i];
+
+ if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+ err |= copy_to_user(buf, m, sizeof(*m));
smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
+ buf += sizeof(*m);
+ memset(m, 0, sizeof(*m));
}
}
@@ -1617,15 +1585,15 @@ timeout:
err = -EFAULT;
out:
- mutex_unlock(&mce_read_mutex);
+ mutex_unlock(&mce_chrdev_read_mutex);
kfree(cpu_tsc);
return err ? err : buf - ubuf;
}
-static unsigned int mce_poll(struct file *file, poll_table *wait)
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
- poll_wait(file, &mce_wait, wait);
+ poll_wait(file, &mce_chrdev_wait, wait);
if (rcu_access_index(mcelog.next))
return POLLIN | POLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
@@ -1633,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
return 0;
}
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
{
int __user *p = (int __user *)arg;
@@ -1661,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
- .open = mce_open,
- .release = mce_release,
- .read = mce_read,
- .poll = mce_poll,
- .unlocked_ioctl = mce_ioctl,
- .llseek = no_llseek,
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);
-static struct miscdevice mce_log_device = {
+static struct miscdevice mce_chrdev_device = {
MISC_MCELOG_MINOR,
"mcelog",
&mce_chrdev_ops,
@@ -1722,15 +1691,13 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
- atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
-
mcheck_intel_therm_init();
return 0;
}
/*
- * Sysfs support
+ * mce_syscore: PM support
*/
/*
@@ -1750,12 +1717,12 @@ static int mce_disable_error_reporting(void)
return 0;
}
-static int mce_suspend(void)
+static int mce_syscore_suspend(void)
{
return mce_disable_error_reporting();
}
-static void mce_shutdown(void)
+static void mce_syscore_shutdown(void)
{
mce_disable_error_reporting();
}
@@ -1765,18 +1732,22 @@ static void mce_shutdown(void)
* Only one CPU is active at this time, the others get re-added later using
* CPU hotplug:
*/
-static void mce_resume(void)
+static void mce_syscore_resume(void)
{
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
}
static struct syscore_ops mce_syscore_ops = {
- .suspend = mce_suspend,
- .shutdown = mce_shutdown,
- .resume = mce_resume,
+ .suspend = mce_syscore_suspend,
+ .shutdown = mce_syscore_shutdown,
+ .resume = mce_syscore_resume,
};
+/*
+ * mce_sysdev: Sysfs support
+ */
+
static void mce_cpu_restart(void *data)
{
del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1812,11 +1783,11 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct sysdev_class mce_sysclass = {
+static struct sysdev_class mce_sysdev_class = {
.name = "machinecheck",
};
-DEFINE_PER_CPU(struct sys_device, mce_dev);
+DEFINE_PER_CPU(struct sys_device, mce_sysdev);
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1945,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
&mce_cmci_disabled
};
-static struct sysdev_attribute *mce_attrs[] = {
+static struct sysdev_attribute *mce_sysdev_attrs[] = {
&attr_tolerant.attr,
&attr_check_interval.attr,
&attr_trigger,
@@ -1956,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
NULL
};
-static cpumask_var_t mce_dev_initialized;
+static cpumask_var_t mce_sysdev_initialized;
/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_create_device(unsigned int cpu)
+static __cpuinit int mce_sysdev_create(unsigned int cpu)
{
+ struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
int err;
int i, j;
if (!mce_available(&boot_cpu_data))
return -EIO;
- memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
- per_cpu(mce_dev, cpu).id = cpu;
- per_cpu(mce_dev, cpu).cls = &mce_sysclass;
+ memset(&sysdev->kobj, 0, sizeof(struct kobject));
+ sysdev->id = cpu;
+ sysdev->cls = &mce_sysdev_class;
- err = sysdev_register(&per_cpu(mce_dev, cpu));
+ err = sysdev_register(sysdev);
if (err)
return err;
- for (i = 0; mce_attrs[i]; i++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ for (i = 0; mce_sysdev_attrs[i]; i++) {
+ err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
if (err)
goto error;
}
for (j = 0; j < banks; j++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu),
- &mce_banks[j].attr);
+ err = sysdev_create_file(sysdev, &mce_banks[j].attr);
if (err)
goto error2;
}
- cpumask_set_cpu(cpu, mce_dev_initialized);
+ cpumask_set_cpu(cpu, mce_sysdev_initialized);
return 0;
error2:
while (--j >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
+ sysdev_remove_file(sysdev, &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
- sysdev_unregister(&per_cpu(mce_dev, cpu));
+ sysdev_unregister(sysdev);
return err;
}
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static __cpuinit void mce_sysdev_remove(unsigned int cpu)
{
+ struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
int i;
- if (!cpumask_test_cpu(cpu, mce_dev_initialized))
+ if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
return;
- for (i = 0; mce_attrs[i]; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ for (i = 0; mce_sysdev_attrs[i]; i++)
+ sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
for (i = 0; i < banks; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
+ sysdev_remove_file(sysdev, &mce_banks[i].attr);
- sysdev_unregister(&per_cpu(mce_dev, cpu));
- cpumask_clear_cpu(cpu, mce_dev_initialized);
+ sysdev_unregister(sysdev);
+ cpumask_clear_cpu(cpu, mce_sysdev_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
@@ -2065,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- mce_create_device(cpu);
+ mce_sysdev_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
@@ -2073,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
- mce_remove_device(cpu);
+ mce_sysdev_remove(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
@@ -2127,27 +2099,28 @@ static __init int mcheck_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
- zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
+ zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
mce_init_banks();
- err = sysdev_class_register(&mce_sysclass);
+ err = sysdev_class_register(&mce_sysdev_class);
if (err)
return err;
for_each_online_cpu(i) {
- err = mce_create_device(i);
+ err = mce_sysdev_create(i);
if (err)
return err;
}
register_syscore_ops(&mce_syscore_ops);
register_hotcpu_notifier(&mce_cpu_notifier);
- misc_register(&mce_log_device);
+
+ /* register character device /dev/mcelog */
+ misc_register(&mce_chrdev_device);
return err;
}
-
device_initcall(mcheck_init_device);
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 167f97b5596..f5474218cff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -509,6 +509,7 @@ recurse:
out_free:
if (b) {
kobject_put(&b->kobj);
+ list_del(&b->miscj);
kfree(b);
}
return err;
@@ -547,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (!b)
goto out;
- err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
+ err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
b->kobj, name);
if (err)
goto out;
@@ -570,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
- b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
+ b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
if (!b->kobj)
goto out_free;
@@ -590,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (i == cpu)
continue;
- err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
+ err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
b->kobj, name);
if (err)
goto out;
@@ -668,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
@@ -680,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
if (i == cpu)
continue;
- sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
+ sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6f8c5e9da97..27c625178bf 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -187,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level)
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
-
- add_taint(TAINT_MACHINE_CHECK);
return 1;
}
if (old_event) {
@@ -355,7 +353,6 @@ static void notify_thresholds(__u64 msr_val)
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
@@ -367,19 +364,19 @@ static void intel_thermal_interrupt(void)
CORE_LEVEL) != 0)
mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (this_cpu_has(X86_FEATURE_PLN))
if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
CORE_LEVEL) != 0)
mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
- if (cpu_has(c, X86_FEATURE_PTS)) {
+ if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
PACKAGE_LEVEL) != 0)
mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (this_cpu_has(X86_FEATURE_PLN))
if (therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
@@ -393,7 +390,6 @@ static void unexpected_thermal_interrupt(void)
{
printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
smp_processor_id());
- add_taint(TAINT_MACHINE_CHECK);
}
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -446,18 +442,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ h = lvtthmr_init;
/*
* The initial value of thermal LVT entries on all APs always reads
* 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
* sequence to them and LVT registers are reset to 0s except for
* the mask bits which are set to 1s when APs receive INIT IPI.
- * Always restore the value that BIOS has programmed on AP based on
- * BSP's info we saved since BIOS is always setting the same value
- * for all threads/cores
+ * If BIOS takes over the thermal interrupt and sets its interrupt
+ * delivery mode to SMI (not fixed), it restores the value that the
+ * BIOS has programmed on AP based on BSP's info we saved since BIOS
+ * is always setting the same value for all threads/cores.
*/
- apic_write(APIC_LVTTHMR, lvtthmr_init);
+ if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
+ apic_write(APIC_LVTTHMR, lvtthmr_init);
- h = lvtthmr_init;
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d..08119a37e53 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
static int have_wrcomb(void)
{
struct pci_dev *dev;
- u8 rev;
dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
* chipsets to be tagged
*/
if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
- dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev <= 5) {
- pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
+ dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
+ dev->revision <= 5) {
+ pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+ pci_dev_put(dev);
+ return 0;
}
/*
* Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,55 +134,43 @@ static void __init init_table(void)
}
struct set_mtrr_data {
- atomic_t count;
- atomic_t gate;
unsigned long smp_base;
unsigned long smp_size;
unsigned int smp_reg;
mtrr_type smp_type;
};
-static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
-
/**
- * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
* @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
-static int mtrr_work_handler(void *info)
+static int mtrr_rendezvous_handler(void *info)
{
#ifdef CONFIG_SMP
struct set_mtrr_data *data = info;
- unsigned long flags;
-
- atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
- cpu_relax();
-
- local_irq_save(flags);
-
- atomic_dec(&data->count);
- while (atomic_read(&data->gate))
- cpu_relax();
- /* The master has cleared me to execute */
+ /*
+ * We use this same function to initialize the mtrrs during boot,
+ * resume, runtime cpu online and on an explicit request to set a
+ * specific MTRR.
+ *
+ * During boot or suspend, the state of the boot cpu's mtrrs has been
+ * saved, and we want to replicate that across all the cpus that come
+ * online (either at the end of boot or resume or during a runtime cpu
+ * online). If we're doing that, @reg is set to something special and on
+ * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
+ * started the boot/resume sequence, this might be a duplicate
+ * set_all()).
+ */
if (data->smp_reg != ~0U) {
mtrr_if->set(data->smp_reg, data->smp_base,
data->smp_size, data->smp_type);
- } else if (mtrr_aps_delayed_init) {
- /*
- * Initialize the MTRRs inaddition to the synchronisation.
- */
+ } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
mtrr_if->set_all();
}
-
- atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
- cpu_relax();
-
- atomic_dec(&data->count);
- local_irq_restore(flags);
#endif
return 0;
}
@@ -223,20 +208,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
* 14. Wait for buddies to catch up
* 15. Enable interrupts.
*
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU announces that it started the rendezvous handler by
- * decrementing the count, We reset data.count and set the data.gate flag
- * allowing all the cpu's to proceed with the work. As each cpu disables
- * interrupts, it'll decrement data.count once. We wait until it hits 0 and
- * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
- * are waiting for that flag to be cleared. Once it's cleared, each
- * CPU goes through the transition of updating MTRRs.
- * The CPU vendors may each do it differently,
- * so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate
- * to be set.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
- * Everyone then enables interrupts and we all continue on.
+ * What does that mean for us? Well, stop_machine() will ensure that
+ * the rendezvous handler is started on each CPU. And in lockstep they
+ * do the state transition of disabling interrupts, updating MTRR's
+ * (the CPU vendors may each do it differently, so we call mtrr_if->set()
+ * callback and let them take care of it.) and enabling interrupts.
*
* Note that the mechanism is the same for UP systems, too; all the SMP stuff
* becomes nops.
@@ -244,92 +220,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
static void
set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
{
- struct set_mtrr_data data;
- unsigned long flags;
- int cpu;
-
- preempt_disable();
-
- data.smp_reg = reg;
- data.smp_base = base;
- data.smp_size = size;
- data.smp_type = type;
- atomic_set(&data.count, num_booting_cpus() - 1);
-
- /* Make sure data.count is visible before unleashing other CPUs */
- smp_wmb();
- atomic_set(&data.gate, 0);
-
- /* Start the ball rolling on other CPUs */
- for_each_online_cpu(cpu) {
- struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
-
- if (cpu == smp_processor_id())
- continue;
-
- stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
- }
-
-
- while (atomic_read(&data.count))
- cpu_relax();
-
- /* Ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 1);
-
- local_irq_save(flags);
-
- while (atomic_read(&data.count))
- cpu_relax();
-
- /* Ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 0);
-
- /* Do our MTRR business */
-
- /*
- * HACK!
- *
- * We use this same function to initialize the mtrrs during boot,
- * resume, runtime cpu online and on an explicit request to set a
- * specific MTRR.
- *
- * During boot or suspend, the state of the boot cpu's mtrrs has been
- * saved, and we want to replicate that across all the cpus that come
- * online (either at the end of boot or resume or during a runtime cpu
- * online). If we're doing that, @reg is set to something special and on
- * this cpu we still do mtrr_if->set_all(). During boot/resume, this
- * is unnecessary if at this point we are still on the cpu that started
- * the boot/resume sequence. But there is no guarantee that we are still
- * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
- * sure that we are in sync with everyone else.
- */
- if (reg != ~0U)
- mtrr_if->set(reg, base, size, type);
- else
- mtrr_if->set_all();
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
- /* Wait for the others */
- while (atomic_read(&data.count))
- cpu_relax();
-
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 1);
-
- /*
- * Wait here for everyone to have seen the gate change
- * So we're the last ones to touch 'data'
- */
- while (atomic_read(&data.count))
- cpu_relax();
+ stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
- local_irq_restore(flags);
- preempt_enable();
+static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
+ stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
+ cpu_callout_mask);
}
/**
@@ -783,7 +693,7 @@ void mtrr_ap_init(void)
* 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
* lock to prevent mtrr entry changes
*/
- set_mtrr(~0U, 0, 0, 0);
+ set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
}
/**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index eed3673a865..4ee3abf20ed 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
-#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
@@ -31,6 +30,7 @@
#include <asm/nmi.h>
#include <asm/compat.h>
#include <asm/smp.h>
+#include <asm/alternative.h>
#if 0
#undef wrmsrl
@@ -44,38 +44,27 @@ do { \
#endif
/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ * | NHM/WSM | SNB |
+ * register -------------------------------
+ * | HT | no HT | HT | no HT |
+ *-----------------------------------------
+ * offcore | core | core | cpu | core |
+ * lbr_sel | core | core | cpu | core |
+ * ld_lat | cpu | core | cpu | core |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
*/
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
- unsigned long offset, addr = (unsigned long)from;
- unsigned long size, len = 0;
- struct page *page;
- void *map;
- int ret;
-
- do {
- ret = __get_user_pages_fast(addr, 1, 0, &page);
- if (!ret)
- break;
-
- offset = addr & (PAGE_SIZE - 1);
- size = min(PAGE_SIZE - offset, n - len);
+enum extra_reg_type {
+ EXTRA_REG_NONE = -1, /* not used */
- map = kmap_atomic(page);
- memcpy(to, map+offset, size);
- kunmap_atomic(map);
- put_page(page);
+ EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
+ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
- len += size;
- to += size;
- addr += size;
-
- } while (len < n);
-
- return len;
-}
+ EXTRA_REG_MAX /* number of entries needed */
+};
struct event_constraint {
union {
@@ -131,11 +120,10 @@ struct cpu_hw_events {
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
/*
- * Intel percore register state.
- * Coordinate shared resources between HT threads.
+ * manage shared (per-core, per-cpu) registers
+ * used on Intel NHM/WSM/SNB
*/
- int percore_used; /* Used by this CPU? */
- struct intel_percore *per_core;
+ struct intel_shared_regs *shared_regs;
/*
* AMD specific bits
@@ -186,26 +174,45 @@ struct cpu_hw_events {
for ((e) = (c); (e)->weight; (e)++)
/*
+ * Per register state.
+ */
+struct er_account {
+ raw_spinlock_t lock; /* per-core: protect structure */
+ u64 config; /* extra MSR config */
+ u64 reg; /* extra MSR number */
+ atomic_t ref; /* reference count */
+};
+
+/*
* Extra registers for specific events.
+ *
* Some events need large masks and require external MSRs.
- * Define a mapping to these extra registers.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
*/
struct extra_reg {
unsigned int event;
unsigned int msr;
u64 config_mask;
u64 valid_mask;
+ int idx; /* per_xxx->regs[] reg index */
};
-#define EVENT_EXTRA_REG(e, ms, m, vm) { \
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
.event = (e), \
.msr = (ms), \
.config_mask = (m), \
.valid_mask = (vm), \
+ .idx = EXTRA_REG_##i \
}
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
- EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
+ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
union perf_capabilities {
struct {
@@ -251,7 +258,6 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
- struct event_constraint *percore_constraints;
void (*quirks)(void);
int perfctr_second_write;
@@ -285,8 +291,12 @@ struct x86_pmu {
* Extra registers for events
*/
struct extra_reg *extra_regs;
+ unsigned int er_flags;
};
+#define ERF_NO_HT_SHARING 1
+#define ERF_HAS_RSP_1 2
+
static struct x86_pmu x86_pmu __read_mostly;
static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -363,12 +373,18 @@ again:
return new_raw_count;
}
-/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
static inline int x86_pmu_addr_offset(int index)
{
- if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
- return index << 1;
- return index;
+ int offset;
+
+ /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
+ alternative_io(ASM_NOP2,
+ "shll $1, %%eax",
+ X86_FEATURE_PERFCTR_CORE,
+ "=a" (offset),
+ "a" (index));
+
+ return offset;
}
static inline unsigned int x86_pmu_config_addr(int index)
@@ -386,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
*/
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
+ struct hw_perf_event_extra *reg;
struct extra_reg *er;
- event->hw.extra_reg = 0;
- event->hw.extra_config = 0;
+ reg = &event->hw.extra_reg;
if (!x86_pmu.extra_regs)
return 0;
@@ -399,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
- event->hw.extra_reg = er->msr;
- event->hw.extra_config = event->attr.config1;
+
+ reg->idx = er->idx;
+ reg->config = event->attr.config1;
+ reg->reg = er->msr;
break;
}
return 0;
@@ -586,8 +604,12 @@ static int x86_setup_perfctr(struct perf_event *event)
return -EOPNOTSUPP;
}
+ /*
+ * Do not allow config1 (extended registers) to propagate,
+ * there's no sane user-space generalization yet:
+ */
if (attr->type == PERF_TYPE_RAW)
- return x86_pmu_extra_regs(event->attr.config, event);
+ return 0;
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, event);
@@ -609,8 +631,8 @@ static int x86_setup_perfctr(struct perf_event *event)
/*
* Branch tracing:
*/
- if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
- (hwc->sample_period == 1)) {
+ if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+ !attr->freq && hwc->sample_period == 1) {
/* BTS is not supported by this architecture. */
if (!x86_pmu.bts_active)
return -EOPNOTSUPP;
@@ -695,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
event->hw.last_cpu = -1;
event->hw.last_tag = ~0ULL;
+ /* mark unused */
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+
return x86_pmu.hw_config(event);
}
@@ -736,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
u64 enable_mask)
{
- if (hwc->extra_reg)
- wrmsrl(hwc->extra_reg, hwc->extra_config);
+ if (hwc->extra_reg.reg)
+ wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
wrmsrl(hwc->config_base, hwc->config | enable_mask);
}
@@ -1284,6 +1309,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
+ /*
+ * Some chipsets need to unmask the LVTPC in a particular spot
+ * inside the nmi handler. As a result, the unmasking was pushed
+ * into all the nmi handlers.
+ *
+ * This generic handler doesn't seem to have any issues where the
+ * unmasking occurs so it was left at the top.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
if (!test_bit(idx, cpuc->active_mask)) {
/*
@@ -1311,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
- if (perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1370,8 +1405,6 @@ perf_event_nmi_handler(struct notifier_block *self,
return NOTIFY_DONE;
}
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
handled = x86_pmu.handle_irq(args->regs);
if (!handled)
return NOTIFY_DONE;
@@ -1618,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
perf_pmu_enable(pmu);
return 0;
}
+/*
+ * a fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * per_core and per_cpu structure. Otherwise, group events
+ * using extra reg may conflict without the kernel being
+ * able to catch this when the last event gets added to
+ * the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+ kfree(cpuc->shared_regs);
+ kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+ struct cpu_hw_events *cpuc;
+ int cpu = raw_smp_processor_id();
+
+ cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+ if (!cpuc)
+ return ERR_PTR(-ENOMEM);
+
+ /* only needed, if we have extra_regs */
+ if (x86_pmu.extra_regs) {
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
+ goto error;
+ }
+ return cpuc;
+error:
+ free_fake_cpuc(cpuc);
+ return ERR_PTR(-ENOMEM);
+}
/*
* validate that we can schedule this event
@@ -1628,9 +1695,9 @@ static int validate_event(struct perf_event *event)
struct event_constraint *c;
int ret = 0;
- fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
- if (!fake_cpuc)
- return -ENOMEM;
+ fake_cpuc = allocate_fake_cpuc();
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
c = x86_pmu.get_event_constraints(fake_cpuc, event);
@@ -1640,7 +1707,7 @@ static int validate_event(struct perf_event *event)
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(fake_cpuc, event);
- kfree(fake_cpuc);
+ free_fake_cpuc(fake_cpuc);
return ret;
}
@@ -1660,36 +1727,32 @@ static int validate_group(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
struct cpu_hw_events *fake_cpuc;
- int ret, n;
-
- ret = -ENOMEM;
- fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
- if (!fake_cpuc)
- goto out;
+ int ret = -ENOSPC, n;
+ fake_cpuc = allocate_fake_cpuc();
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
/*
* the event is not yet connected with its
* siblings therefore we must first collect
* existing siblings, then add the new event
* before we can simulate the scheduling
*/
- ret = -ENOSPC;
n = collect_events(fake_cpuc, leader, true);
if (n < 0)
- goto out_free;
+ goto out;
fake_cpuc->n_events = n;
n = collect_events(fake_cpuc, event, false);
if (n < 0)
- goto out_free;
+ goto out;
fake_cpuc->n_events = n;
ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
-out_free:
- kfree(fake_cpuc);
out:
+ free_fake_cpuc(fake_cpuc);
return ret;
}
@@ -1754,17 +1817,6 @@ static struct pmu pmu = {
* callchain support
*/
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- /* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
- /* Ignore warnings */
-}
-
static int backtrace_stack(void *data, char *name)
{
return 0;
@@ -1778,8 +1830,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops backtrace_ops = {
- .warning = backtrace_warning,
- .warning_symbol = backtrace_warning_symbol,
.stack = backtrace_stack,
.address = backtrace_address,
.walk_stack = print_context_stack_bp,
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 461f62bbd77..941caa2e449 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(L1D) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
+ [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+ [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
};
/*
@@ -96,12 +110,14 @@ static __initconst const u64 amd_hw_cache_event_ids
*/
static const u64 amd_perfmon_event_map[] =
{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
};
static u64 amd_pmu_event_map(int hw_event)
@@ -427,7 +443,9 @@ static __initconst const struct x86_pmu amd_pmu = {
*
* Exceptions:
*
+ * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
* 0x003 FP PERF_CTL[3]
+ * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
* 0x00B FP PERF_CTL[3]
* 0x00D FP PERF_CTL[3]
* 0x023 DE PERF_CTL[2:0]
@@ -448,6 +466,8 @@ static __initconst const struct x86_pmu amd_pmu = {
* 0x0DF LS PERF_CTL[5:0]
* 0x1D6 EX PERF_CTL[5:0]
* 0x1D8 EX PERF_CTL[5:0]
+ *
+ * (*) depending on the umask all FPU counters may be used
*/
static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
@@ -460,18 +480,28 @@ static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
static struct event_constraint *
amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
{
- unsigned int event_code = amd_get_event_code(&event->hw);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned int event_code = amd_get_event_code(hwc);
switch (event_code & AMD_EVENT_TYPE_MASK) {
case AMD_EVENT_FP:
switch (event_code) {
+ case 0x000:
+ if (!(hwc->config & 0x0000F000ULL))
+ break;
+ if (!(hwc->config & 0x00000F00ULL))
+ break;
+ return &amd_f15_PMC3;
+ case 0x004:
+ if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+ break;
+ return &amd_f15_PMC3;
case 0x003:
case 0x00B:
case 0x00D:
return &amd_f15_PMC3;
- default:
- return &amd_f15_PMC53;
}
+ return &amd_f15_PMC53;
case AMD_EVENT_LS:
case AMD_EVENT_DC:
case AMD_EVENT_EX_LS:
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 8fc2b2cee1d..45fbb8f7f54 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,31 +1,21 @@
#ifdef CONFIG_CPU_SUP_INTEL
-#define MAX_EXTRA_REGS 2
-
-/*
- * Per register state.
- */
-struct er_account {
- int ref; /* reference count */
- unsigned int extra_reg; /* extra MSR number */
- u64 extra_config; /* extra MSR config */
-};
-
/*
- * Per core state
- * This used to coordinate shared registers for HT threads.
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
*/
-struct intel_percore {
- raw_spinlock_t lock; /* protect structure */
- struct er_account regs[MAX_EXTRA_REGS];
- int refcnt; /* number of threads */
- unsigned core_id;
+struct intel_shared_regs {
+ struct er_account regs[EXTRA_REG_MAX];
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
};
/*
* Intel PerfMon, used on Core and later.
*/
-static const u64 intel_perfmon_event_map[] =
+static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
@@ -36,7 +26,7 @@ static const u64 intel_perfmon_event_map[] =
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
};
-static struct event_constraint intel_core_event_constraints[] =
+static struct event_constraint intel_core_event_constraints[] __read_mostly =
{
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -47,7 +37,7 @@ static struct event_constraint intel_core_event_constraints[] =
EVENT_CONSTRAINT_END
};
-static struct event_constraint intel_core2_event_constraints[] =
+static struct event_constraint intel_core2_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -70,7 +60,7 @@ static struct event_constraint intel_core2_event_constraints[] =
EVENT_CONSTRAINT_END
};
-static struct event_constraint intel_nehalem_event_constraints[] =
+static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -86,19 +76,13 @@ static struct event_constraint intel_nehalem_event_constraints[] =
EVENT_CONSTRAINT_END
};
-static struct extra_reg intel_nehalem_extra_regs[] =
+static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
+ INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
EVENT_EXTRA_END
};
-static struct event_constraint intel_nehalem_percore_constraints[] =
-{
- INTEL_EVENT_CONSTRAINT(0xb7, 0),
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_westmere_event_constraints[] =
+static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -110,34 +94,30 @@ static struct event_constraint intel_westmere_event_constraints[] =
EVENT_CONSTRAINT_END
};
-static struct event_constraint intel_snb_event_constraints[] =
+static struct event_constraint intel_snb_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
- INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
EVENT_CONSTRAINT_END
};
-static struct extra_reg intel_westmere_extra_regs[] =
+static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
+ INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
EVENT_EXTRA_END
};
-static struct event_constraint intel_westmere_percore_constraints[] =
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
{
- INTEL_EVENT_CONSTRAINT(0xb7, 0),
- INTEL_EVENT_CONSTRAINT(0xbb, 0),
EVENT_CONSTRAINT_END
};
-static struct event_constraint intel_gen_event_constraints[] =
+static struct event_constraint intel_gen_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] =
EVENT_CONSTRAINT_END
};
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+ INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
@@ -184,26 +170,23 @@ static __initconst const u64 snb_hw_cache_event_ids
},
},
[ C(LL ) ] = {
- /*
- * TBD: Need Off-core Response Performance Monitoring support
- */
[ C(OP_READ) ] = {
- /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
[ C(OP_WRITE) ] = {
- /* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
[ C(OP_PREFETCH) ] = {
- /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
},
[ C(DTLB) ] = {
@@ -248,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+
};
static __initconst const u64 westmere_hw_cache_event_ids
@@ -285,26 +283,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
- /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
/*
* Use RFO, not WRITEBACK, because a write miss would typically occur
* on RFO.
*/
[ C(OP_WRITE) ] = {
- /* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01bb,
- /* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
[ C(RESULT_MISS) ] = 0x01b7,
},
[ C(OP_PREFETCH) ] = {
- /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
},
[ C(DTLB) ] = {
@@ -349,19 +347,53 @@ static __initconst const u64 westmere_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
};
/*
- * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3
+ * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
+ * See IA32 SDM Vol 3B 30.6.1.3
*/
-#define DMND_DATA_RD (1 << 0)
-#define DMND_RFO (1 << 1)
-#define DMND_WB (1 << 3)
-#define PF_DATA_RD (1 << 4)
-#define PF_DATA_RFO (1 << 5)
-#define RESP_UNCORE_HIT (1 << 8)
-#define RESP_MISS (0xf600) /* non uncore hit */
+#define NHM_DMND_DATA_RD (1 << 0)
+#define NHM_DMND_RFO (1 << 1)
+#define NHM_DMND_IFETCH (1 << 2)
+#define NHM_DMND_WB (1 << 3)
+#define NHM_PF_DATA_RD (1 << 4)
+#define NHM_PF_DATA_RFO (1 << 5)
+#define NHM_PF_IFETCH (1 << 6)
+#define NHM_OFFCORE_OTHER (1 << 7)
+#define NHM_UNCORE_HIT (1 << 8)
+#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
+#define NHM_OTHER_CORE_HITM (1 << 10)
+ /* reserved */
+#define NHM_REMOTE_CACHE_FWD (1 << 12)
+#define NHM_REMOTE_DRAM (1 << 13)
+#define NHM_LOCAL_DRAM (1 << 14)
+#define NHM_NON_DRAM (1 << 15)
+
+#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
+
+#define NHM_DMND_READ (NHM_DMND_DATA_RD)
+#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
+#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
+
+#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
+#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
static __initconst const u64 nehalem_hw_cache_extra_regs
[PERF_COUNT_HW_CACHE_MAX]
@@ -370,18 +402,32 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
{
[ C(LL ) ] = {
[ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT,
- [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS,
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
},
[ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT,
- [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS,
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
},
[ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT,
- [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS,
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
},
- }
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+ },
+ },
};
static __initconst const u64 nehalem_hw_cache_event_ids
@@ -391,12 +437,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids
{
[ C(L1D) ] = {
[ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
},
[ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
+ [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
@@ -483,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
};
static __initconst const u64 core2_hw_cache_event_ids
@@ -933,6 +993,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
+ /*
+ * Some chipsets need to unmask the LVTPC in a particular spot
+ * inside the nmi handler. As a result, the unmasking was pushed
+ * into all the nmi handlers.
+ *
+ * This handler doesn't seem to have any issues with the unmasking
+ * so it was left at the top.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
@@ -976,7 +1046,7 @@ again:
data.period = event->hw.last_period;
- if (perf_event_overflow(event, 1, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -998,6 +1068,9 @@ intel_bts_constraints(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
unsigned int hw_event, bts_event;
+ if (event->attr.freq)
+ return NULL;
+
hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
@@ -1007,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
+static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+{
+ if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+ return false;
+
+ if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= 0x01bb;
+ event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
+ } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= 0x01b7;
+ event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+ }
+
+ if (event->hw.extra_reg.idx == orig_idx)
+ return false;
+
+ return true;
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
static struct event_constraint *
-intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
{
- struct hw_perf_event *hwc = &event->hw;
- unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
- struct event_constraint *c;
- struct intel_percore *pc;
+ struct event_constraint *c = &emptyconstraint;
+ struct hw_perf_event_extra *reg = &event->hw.extra_reg;
struct er_account *era;
- int i;
- int free_slot;
- int found;
+ unsigned long flags;
+ int orig_idx = reg->idx;
- if (!x86_pmu.percore_constraints || hwc->extra_alloc)
- return NULL;
+ /* already allocated shared msr */
+ if (reg->alloc)
+ return &unconstrained;
- for (c = x86_pmu.percore_constraints; c->cmask; c++) {
- if (e != c->code)
- continue;
+again:
+ era = &cpuc->shared_regs->regs[reg->idx];
+ /*
+ * we use spin_lock_irqsave() to avoid lockdep issues when
+ * passing a fake cpuc
+ */
+ raw_spin_lock_irqsave(&era->lock, flags);
+
+ if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+ /* lock in msr value */
+ era->config = reg->config;
+ era->reg = reg->reg;
+
+ /* one more user */
+ atomic_inc(&era->ref);
+
+ /* no need to reallocate during incremental event scheduling */
+ reg->alloc = 1;
/*
- * Allocate resource per core.
+ * All events using extra_reg are unconstrained.
+ * Avoids calling x86_get_event_constraints()
+ *
+ * Must revisit if extra_reg controlling events
+ * ever have constraints. Worst case we go through
+ * the regular event constraint table.
*/
- pc = cpuc->per_core;
- if (!pc)
- break;
- c = &emptyconstraint;
- raw_spin_lock(&pc->lock);
- free_slot = -1;
- found = 0;
- for (i = 0; i < MAX_EXTRA_REGS; i++) {
- era = &pc->regs[i];
- if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
- /* Allow sharing same config */
- if (hwc->extra_config == era->extra_config) {
- era->ref++;
- cpuc->percore_used = 1;
- hwc->extra_alloc = 1;
- c = NULL;
- }
- /* else conflict */
- found = 1;
- break;
- } else if (era->ref == 0 && free_slot == -1)
- free_slot = i;
- }
- if (!found && free_slot != -1) {
- era = &pc->regs[free_slot];
- era->ref = 1;
- era->extra_reg = hwc->extra_reg;
- era->extra_config = hwc->extra_config;
- cpuc->percore_used = 1;
- hwc->extra_alloc = 1;
- c = NULL;
- }
- raw_spin_unlock(&pc->lock);
- return c;
+ c = &unconstrained;
+ } else if (intel_try_alt_er(event, orig_idx)) {
+ raw_spin_unlock(&era->lock);
+ goto again;
}
+ raw_spin_unlock_irqrestore(&era->lock, flags);
- return NULL;
+ return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+ struct hw_perf_event_extra *reg)
+{
+ struct er_account *era;
+
+ /*
+ * only put constraint if extra reg was actually
+ * allocated. Also takes care of event which do
+ * not use an extra shared reg
+ */
+ if (!reg->alloc)
+ return;
+
+ era = &cpuc->shared_regs->regs[reg->idx];
+
+ /* one fewer user */
+ atomic_dec(&era->ref);
+
+ /* allocate again next time */
+ reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct event_constraint *c = NULL;
+
+ if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
+ c = __intel_shared_reg_get_constraints(cpuc, event);
+
+ return c;
}
static struct event_constraint *
@@ -1081,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
if (c)
return c;
- c = intel_percore_constraints(cpuc, event);
+ c = intel_shared_regs_constraints(cpuc, event);
if (c)
return c;
return x86_get_event_constraints(cpuc, event);
}
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
- struct extra_reg *er;
- struct intel_percore *pc;
- struct er_account *era;
- struct hw_perf_event *hwc = &event->hw;
- int i, allref;
-
- if (!cpuc->percore_used)
- return;
+ struct hw_perf_event_extra *reg;
- for (er = x86_pmu.extra_regs; er->msr; er++) {
- if (er->event != (hwc->config & er->config_mask))
- continue;
+ reg = &event->hw.extra_reg;
+ if (reg->idx != EXTRA_REG_NONE)
+ __intel_shared_reg_put_constraints(cpuc, reg);
+}
- pc = cpuc->per_core;
- raw_spin_lock(&pc->lock);
- for (i = 0; i < MAX_EXTRA_REGS; i++) {
- era = &pc->regs[i];
- if (era->ref > 0 &&
- era->extra_config == hwc->extra_config &&
- era->extra_reg == er->msr) {
- era->ref--;
- hwc->extra_alloc = 0;
- break;
- }
- }
- allref = 0;
- for (i = 0; i < MAX_EXTRA_REGS; i++)
- allref += pc->regs[i].ref;
- if (allref == 0)
- cpuc->percore_used = 0;
- raw_spin_unlock(&pc->lock);
- break;
- }
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ intel_put_shared_regs_event_constraints(cpuc, event);
}
static int intel_pmu_hw_config(struct perf_event *event)
@@ -1201,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
.event_constraints = intel_core_event_constraints,
};
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+ struct intel_shared_regs *regs;
+ int i;
+
+ regs = kzalloc_node(sizeof(struct intel_shared_regs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (regs) {
+ /*
+ * initialize the locks to keep lockdep happy
+ */
+ for (i = 0; i < EXTRA_REG_MAX; i++)
+ raw_spin_lock_init(&regs->regs[i].lock);
+
+ regs->core_id = -1;
+ }
+ return regs;
+}
+
static int intel_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- if (!cpu_has_ht_siblings())
+ if (!x86_pmu.extra_regs)
return NOTIFY_OK;
- cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
- GFP_KERNEL, cpu_to_node(cpu));
- if (!cpuc->per_core)
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
return NOTIFY_BAD;
- raw_spin_lock_init(&cpuc->per_core->lock);
- cpuc->per_core->core_id = -1;
return NOTIFY_OK;
}
@@ -1230,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
*/
intel_pmu_lbr_reset();
- if (!cpu_has_ht_siblings())
+ if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
return;
for_each_cpu(i, topology_thread_cpumask(cpu)) {
- struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
+ struct intel_shared_regs *pc;
+ pc = per_cpu(cpu_hw_events, i).shared_regs;
if (pc && pc->core_id == core_id) {
- kfree(cpuc->per_core);
- cpuc->per_core = pc;
+ kfree(cpuc->shared_regs);
+ cpuc->shared_regs = pc;
break;
}
}
- cpuc->per_core->core_id = core_id;
- cpuc->per_core->refcnt++;
+ cpuc->shared_regs->core_id = core_id;
+ cpuc->shared_regs->refcnt++;
}
static void intel_pmu_cpu_dying(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- struct intel_percore *pc = cpuc->per_core;
+ struct intel_shared_regs *pc;
+ pc = cpuc->shared_regs;
if (pc) {
if (pc->core_id == -1 || --pc->refcnt == 0)
kfree(pc);
- cpuc->per_core = NULL;
+ cpuc->shared_regs = NULL;
}
fini_debug_store_on_cpu(cpu);
@@ -1305,7 +1431,7 @@ static void intel_clovertown_quirks(void)
* AJ106 could possibly be worked around by not allowing LBR
* usage from PEBS, including the fixup.
* AJ68 could possibly be worked around by always programming
- * a pebs_event_reset[0] value and coping with the lost events.
+ * a pebs_event_reset[0] value and coping with the lost events.
*
* But taken together it might just make sense to not enable PEBS on
* these chips.
@@ -1406,9 +1532,25 @@ static __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_nehalem_event_constraints;
x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
- x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.extra_regs = intel_nehalem_extra_regs;
+
+ /* UOPS_ISSUED.STALLED_CYCLES */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+
+ if (ebx & 0x40) {
+ /*
+ * Erratum AAJ80 detected, we work it around by using
+ * the BR_MISP_EXEC.ANY event. This will over-count
+ * branch-misses, but it's still much better than the
+ * architectural event which is often completely bogus:
+ */
+ intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+
+ pr_cont("erratum AAJ80 worked around, ");
+ }
pr_cont("Nehalem events, ");
break;
@@ -1425,6 +1567,7 @@ static __init int intel_pmu_init(void)
case 37: /* 32 nm nehalem, "Clarkdale" */
case 44: /* 32 nm nehalem, "Gulftown" */
+ case 47: /* 32 nm Xeon E7 */
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -1433,10 +1576,16 @@ static __init int intel_pmu_init(void)
intel_pmu_lbr_init_nhm();
x86_pmu.event_constraints = intel_westmere_event_constraints;
- x86_pmu.percore_constraints = intel_westmere_percore_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
x86_pmu.extra_regs = intel_westmere_extra_regs;
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+
+ /* UOPS_ISSUED.STALLED_CYCLES */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+
pr_cont("Westmere events, ");
break;
@@ -1448,15 +1597,33 @@ static __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_events;
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
+
pr_cont("SandyBridge events, ");
break;
default:
- /*
- * default constraints for v2 and up
- */
- x86_pmu.event_constraints = intel_gen_event_constraints;
- pr_cont("generic architected perfmon, ");
+ switch (x86_pmu.version) {
+ case 1:
+ x86_pmu.event_constraints = intel_v1_event_constraints;
+ pr_cont("generic architected perfmon v1, ");
+ break;
+ default:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ break;
+ }
}
return 0;
}
@@ -1468,4 +1635,8 @@ static int intel_pmu_init(void)
return 0;
}
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+ return NULL;
+}
#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee2..1b1ef3addcf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
*/
perf_prepare_sample(&header, &data, event, &regs);
- if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+ if (perf_output_begin(&handle, event, header.size * (top - at)))
return 1;
for (; at < top; at++) {
@@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
else
regs.flags &= ~PERF_EFLAGS_EXACT;
- if (perf_event_overflow(event, 1, &data, &regs))
+ if (perf_event_overflow(event, &data, &regs))
x86_pmu_stop(event, 0);
}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index c2520e178d3..7809d2bcb20 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = {
.opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
.escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
.escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
.cntr = { {12, 13, 16}, {14, 15, 17} },
},
[P4_EVENT_X87_ASSIST] = {
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
[ C(RESULT_MISS) ] = -1,
},
},
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * Because of Netburst being quite restricted in how many
+ * identical events may run simultaneously, we introduce event aliases,
+ * ie the different events which have the same functionality but
+ * utilize non-intersected resources (ESCR/CCCR/counter registers).
+ *
+ * This allow us to relax restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
+ * either up to date automatically or not applicable at all.
+ */
+struct p4_event_alias {
+ u64 original;
+ u64 alternative;
+} p4_event_aliases[] = {
+ {
+ /*
+ * Non-halted cycles can be substituted with non-sleeping cycles (see
+ * Intel SDM Vol3b for details). We need this alias to be able
+ * to run nmi-watchdog and 'perf top' (or any other user space tool
+ * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+ * simultaneously.
+ */
+ .original =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+ .alternative =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE),
+ },
};
+static u64 p4_get_alias_event(u64 config)
+{
+ u64 config_match;
+ int i;
+
+ /*
+ * Only event with special mark is allowed,
+ * we're to be sure it didn't come as malformed
+ * RAW event.
+ */
+ if (!(config & P4_CONFIG_ALIASABLE))
+ return 0;
+
+ config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+ for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+ if (config_match == p4_event_aliases[i].original) {
+ config_match = p4_event_aliases[i].alternative;
+ break;
+ } else if (config_match == p4_event_aliases[i].alternative) {
+ config_match = p4_event_aliases[i].original;
+ break;
+ }
+ }
+
+ if (i >= ARRAY_SIZE(p4_event_aliases))
+ return 0;
+
+ return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
+
static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
/* non-halted CPU clocks */
[PERF_COUNT_HW_CPU_CYCLES] =
p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
+ P4_CONFIG_ALIASABLE,
/*
* retired instructions
@@ -912,8 +1001,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- data.addr = 0;
- data.raw = NULL;
+ perf_sample_data_init(&data, 0);
cpuc = &__get_cpu_var(cpu_hw_events);
@@ -946,15 +1034,24 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
- if (perf_event_overflow(event, 1, &data, regs))
- p4_pmu_disable_event(event);
+ if (perf_event_overflow(event, &data, regs))
+ x86_pmu_stop(event, 0);
}
- if (handled) {
- /* p4 quirk: unmask it again */
- apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+ if (handled)
inc_irq_stat(apic_perf_irqs);
- }
+
+ /*
+ * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
+ * been observed that the OVF bit flag has to be cleared first _before_
+ * the LVTPC can be unmasked.
+ *
+ * The reason is the NMI line will continue to be asserted while the OVF
+ * bit is set. This causes a second NMI to generate if the LVTPC is
+ * unmasked before the OVF bit is cleared, leading to unknown NMI
+ * messages.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
return handled;
}
@@ -1112,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
struct p4_event_bind *bind;
unsigned int i, thread, num;
int cntr_idx, escr_idx;
+ u64 config_alias;
+ int pass;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1120,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
hwc = &cpuc->event_list[i]->hw;
thread = p4_ht_thread(cpu);
+ pass = 0;
+
+again:
+ /*
+ * It's possible to hit a circular lock
+ * between original and alternative events
+ * if both are scheduled already.
+ */
+ if (pass > 2)
+ goto done;
+
bind = p4_config_get_bind(hwc->config);
escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
if (unlikely(escr_idx == -1))
@@ -1133,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
}
cntr_idx = p4_next_cntr(thread, used_mask, bind);
- if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
- goto done;
+ if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+ /*
+ * Check whether an event alias is still available.
+ */
+ config_alias = p4_get_alias_event(hwc->config);
+ if (!config_alias)
+ goto done;
+ hwc->config = config_alias;
+ pass++;
+ goto again;
+ }
p4_pmu_swap_config_ts(hwc, cpu);
if (assign)
@@ -1188,7 +1307,7 @@ static __init int p4_pmu_init(void)
{
unsigned int low, high;
- /* If we get stripped -- indexig fails */
+ /* If we get stripped -- indexing fails */
BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
rdmsr(MSR_IA32_MISC_ENABLE, low, high);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 706a9fb46a5..a621f342768 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/of_pci.h>
+#include <linux/initrd.h>
#include <asm/hpet.h>
#include <asm/irq_controller.h>
@@ -98,6 +99,16 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
}
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init early_init_dt_setup_initrd_arch(unsigned long start,
+ unsigned long end)
+{
+ initrd_start = (unsigned long)__va(start);
+ initrd_end = (unsigned long)__va(end);
+ initrd_below_start_ok = 1;
+}
+#endif
+
void __init add_dtb(u64 data)
{
initial_dtb = data + offsetof(struct setup_data, data);
@@ -123,6 +134,24 @@ static int __init add_bus_probe(void)
module_init(add_bus_probe);
#ifdef CONFIG_PCI
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+ struct device_node *np;
+
+ for_each_node_by_type(np, "pci") {
+ const void *prop;
+ unsigned int bus_min;
+
+ prop = of_get_property(np, "bus-range", NULL);
+ if (!prop)
+ continue;
+ bus_min = be32_to_cpup(prop);
+ if (bus->number == bus_min)
+ return np;
+ }
+ return NULL;
+}
+
static int x86_of_pci_irq_enable(struct pci_dev *dev)
{
struct of_irq oirq;
@@ -154,50 +183,8 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev)
void __cpuinit x86_of_pci_init(void)
{
- struct device_node *np;
-
pcibios_enable_irq = x86_of_pci_irq_enable;
pcibios_disable_irq = x86_of_pci_irq_disable;
-
- for_each_node_by_type(np, "pci") {
- const void *prop;
- struct pci_bus *bus;
- unsigned int bus_min;
- struct device_node *child;
-
- prop = of_get_property(np, "bus-range", NULL);
- if (!prop)
- continue;
- bus_min = be32_to_cpup(prop);
-
- bus = pci_find_bus(0, bus_min);
- if (!bus) {
- printk(KERN_ERR "Can't find a node for bus %s.\n",
- np->full_name);
- continue;
- }
-
- if (bus->self)
- bus->self->dev.of_node = np;
- else
- bus->dev.of_node = np;
-
- for_each_child_of_node(np, child) {
- struct pci_dev *dev;
- u32 devfn;
-
- prop = of_get_property(child, "reg", NULL);
- if (!prop)
- continue;
-
- devfn = (be32_to_cpup(prop) >> 8) & 0xff;
- dev = pci_get_slot(bus, devfn);
- if (!dev)
- continue;
- dev->dev.of_node = child;
- pci_dev_put(dev);
- }
- }
}
#endif
@@ -369,6 +356,7 @@ static struct of_ioapic_type of_ioapic_type[] =
static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
u32 *out_hwirq, u32 *out_type)
{
+ struct mp_ioapic_gsi *gsi_cfg;
struct io_apic_irq_attr attr;
struct of_ioapic_type *it;
u32 line, idx, type;
@@ -378,7 +366,8 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
line = *intspec;
idx = (u32) id->priv;
- *out_hwirq = line + mp_gsi_routing[idx].gsi_base;
+ gsi_cfg = mp_ioapic_gsi_routing(idx);
+ *out_hwirq = line + gsi_cfg->gsi_base;
intspec++;
type = *intspec;
@@ -391,7 +380,7 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
- return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr);
+ return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
}
static void __init ioapic_add_ofnode(struct device_node *np)
@@ -407,7 +396,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
}
for (i = 0; i < nr_ioapics; i++) {
- if (r.start == mp_ioapics[i].apicaddr) {
+ if (r.start == mpc_ioapic_addr(i)) {
struct irq_domain *id;
id = kzalloc(sizeof(*id), GFP_KERNEL);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index e2a3f0606da..1aae78f775f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo,
}
EXPORT_SYMBOL_GPL(print_context_stack_bp);
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s%s\n", (char *)data, msg);
-}
-
static int print_trace_stack(void *data, char *name)
{
printk("%s <%s> ", (char *)data, name);
@@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
.stack = print_trace_stack,
.address = print_trace_address,
.walk_stack = print_context_stack,
@@ -279,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
printk("DEBUG_PAGEALLOC");
#endif
printk("\n");
- sysfs_printk_last_file();
if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
return 1;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index e71c98d3c0d..19853ad8afc 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
}
/*
- * We are returning from the irq stack and go to the previous one.
- * If the previous stack is also in the irq stack, then bp in the first
- * frame of the irq stack points to the previous, interrupted one.
- * Otherwise we have another level of indirection: We first save
- * the bp of the previous stack, then we switch the stack to the irq one
- * and save a new bp that links to the previous one.
- * (See save_args())
- */
-static inline unsigned long
-fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
- unsigned long *irq_stack, unsigned long *irq_stack_end)
-{
-#ifdef CONFIG_FRAME_POINTER
- struct stack_frame *frame = (struct stack_frame *)bp;
- unsigned long next;
-
- if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
- if (!probe_kernel_address(&frame->next_frame, next))
- return next;
- else
- WARN_ONCE(1, "Perf: bad frame pointer = %p in "
- "callchain\n", &frame->next_frame);
- }
-#endif
- return bp;
-}
-
-/*
* x86-64 can have up to three kernel stacks:
* process stack
* interrupt stack
@@ -155,9 +127,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
task = current;
if (!stack) {
- stack = &dummy;
- if (task && task != current)
+ if (regs)
+ stack = (unsigned long *)regs->sp;
+ else if (task && task != current)
stack = (unsigned long *)task->thread.sp;
+ else
+ stack = &dummy;
}
if (!bp)
@@ -205,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
* pointer (index -1 to end) in the IRQ stack:
*/
stack = (unsigned long *) (irq_stack_end[-1]);
- bp = fixup_bp_irq_link(bp, stack, irq_stack,
- irq_stack_end);
irq_stack_end = NULL;
ops->stack(data, "EOI");
continue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0c989..e13329d800c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -9,6 +9,8 @@
/*
* entry.S contains the system-call and fault low-level handling routines.
*
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
*
@@ -297,27 +299,26 @@ ENDPROC(native_usergs_sysret64)
.endm
/* save partial stack frame */
- .pushsection .kprobes.text, "ax"
-ENTRY(save_args)
- XCPT_FRAME
+ .macro SAVE_ARGS_IRQ
cld
- /*
- * start from rbp in pt_regs and jump over
- * return address.
- */
- movq_cfi rdi, RDI+8-RBP
- movq_cfi rsi, RSI+8-RBP
- movq_cfi rdx, RDX+8-RBP
- movq_cfi rcx, RCX+8-RBP
- movq_cfi rax, RAX+8-RBP
- movq_cfi r8, R8+8-RBP
- movq_cfi r9, R9+8-RBP
- movq_cfi r10, R10+8-RBP
- movq_cfi r11, R11+8-RBP
-
- leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
- movq_cfi rbp, 8 /* push %rbp */
- leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
+ /* start from rbp in pt_regs and jump over */
+ movq_cfi rdi, RDI-RBP
+ movq_cfi rsi, RSI-RBP
+ movq_cfi rdx, RDX-RBP
+ movq_cfi rcx, RCX-RBP
+ movq_cfi rax, RAX-RBP
+ movq_cfi r8, R8-RBP
+ movq_cfi r9, R9-RBP
+ movq_cfi r10, R10-RBP
+ movq_cfi r11, R11-RBP
+
+ /* Save rbp so that we can unwind from get_irq_regs() */
+ movq_cfi rbp, 0
+
+ /* Save previous stack value */
+ movq %rsp, %rsi
+
+ leaq -RBP(%rsp),%rdi /* arg1 for handler */
testl $3, CS(%rdi)
je 1f
SWAPGS
@@ -329,19 +330,14 @@ ENTRY(save_args)
*/
1: incl PER_CPU_VAR(irq_count)
jne 2f
- popq_cfi %rax /* move return address... */
mov PER_CPU_VAR(irq_stack_ptr),%rsp
EMPTY_FRAME 0
- pushq_cfi %rbp /* backlink for unwinder */
- pushq_cfi %rax /* ... to the new stack */
- /*
- * We entered an interrupt context - irqs are off:
- */
-2: TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-END(save_args)
- .popsection
+
+2: /* Store previous stack value */
+ pushq %rsi
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+ .endm
ENTRY(save_rest)
PARTIAL_FRAME 1 REST_SKIP+8
@@ -473,7 +469,7 @@ ENTRY(system_call_after_swapgs)
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1
+ SAVE_ARGS 8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -508,7 +504,7 @@ sysret_check:
TRACE_IRQS_ON
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 0,-ARG_SKIP,1
+ RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
@@ -791,7 +787,7 @@ END(interrupt)
/* reserve pt_regs for scratch regs and rbp */
subq $ORIG_RAX-RBP, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
- call save_args
+ SAVE_ARGS_IRQ
PARTIAL_FRAME 0
call \func
.endm
@@ -814,15 +810,14 @@ ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
- leaveq
- CFI_RESTORE rbp
+ /* Restore saved previous stack */
+ popq %rsi
+ leaq 16(%rsi), %rsp
+
CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
+ CFI_ADJUST_CFA_OFFSET -16
- /* we did not save rbx, restore only from ARGOFFSET */
- addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
@@ -858,7 +853,7 @@ retint_restore_args: /* return to kernel space */
*/
TRACE_IRQS_IRETQ
restore_args:
- RESTORE_ARGS 0,8,0
+ RESTORE_ARGS 1,8,1
irq_return:
INTERRUPT_RETURN
@@ -991,11 +986,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
- mce_self_interrupt smp_mce_self_interrupt
-#endif
-
#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
call_function_single_interrupt smp_call_function_single_interrupt
@@ -1121,6 +1111,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
zeroentry coprocessor_error do_coprocessor_error
errorentry alignment_check do_alignment_check
zeroentry simd_coprocessor_error do_simd_coprocessor_error
+zeroentry emulate_vsyscall do_emulate_vsyscall
+
/* Reload gs selector with exception handling */
/* edi: new selector */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index a93742a5746..c9a281f272f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -123,7 +123,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
static atomic_t nmi_running = ATOMIC_INIT(0);
static int mod_code_status; /* holds return value of text write */
static void *mod_code_ip; /* holds the IP to write to */
-static void *mod_code_newcode; /* holds the text to write to the IP */
+static const void *mod_code_newcode; /* holds the text to write to the IP */
static unsigned nmi_wait_count;
static atomic_t nmi_update_count = ATOMIC_INIT(0);
@@ -225,7 +225,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
}
static int
-do_ftrace_mod_code(unsigned long ip, void *new_code)
+do_ftrace_mod_code(unsigned long ip, const void *new_code)
{
/*
* On x86_64, kernel text mappings are mapped read-only with
@@ -260,14 +260,14 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
return mod_code_status;
}
-static unsigned char *ftrace_nop_replace(void)
+static const unsigned char *ftrace_nop_replace(void)
{
- return ideal_nop5;
+ return ideal_nops[NOP_ATOMIC5];
}
static int
-ftrace_modify_code(unsigned long ip, unsigned char *old_code,
- unsigned char *new_code)
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
{
unsigned char replaced[MCOUNT_INSN_SIZE];
@@ -301,7 +301,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
int ftrace_make_nop(struct module *mod,
struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned char *new, *old;
+ unsigned const char *new, *old;
unsigned long ip = rec->ip;
old = ftrace_call_replace(ip, addr);
@@ -312,7 +312,7 @@ int ftrace_make_nop(struct module *mod,
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned char *new, *old;
+ unsigned const char *new, *old;
unsigned long ip = rec->ip;
old = ftrace_nop_replace();
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6d6bb36193..3bb08509a7a 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -23,7 +23,6 @@
static void __init i386_default_early_setup(void)
{
/* Initialize 32bit specific setup functions */
- x86_init.resources.probe_roms = probe_roms;
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index bfe8f729e08..4aecc54236a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
+#include <linux/i8253.h>
#include <linux/slab.h>
#include <linux/hpet.h>
#include <linux/init.h>
@@ -12,8 +13,8 @@
#include <linux/io.h>
#include <asm/fixmap.h>
-#include <asm/i8253.h>
#include <asm/hpet.h>
+#include <asm/time.h>
#define HPET_MASK CLOCKSOURCE_MASK(32)
@@ -71,7 +72,7 @@ static inline void hpet_set_mapping(void)
{
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
#ifdef CONFIG_X86_64
- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+ __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
#endif
}
@@ -217,7 +218,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
/*
* Common hpet info
*/
-static unsigned long hpet_period;
+static unsigned long hpet_freq;
static void hpet_legacy_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt);
@@ -232,7 +233,6 @@ static struct clock_event_device hpet_clockevent = {
.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
.set_mode = hpet_legacy_set_mode,
.set_next_event = hpet_legacy_next_event,
- .shift = 32,
.irq = 0,
.rating = 50,
};
@@ -290,28 +290,12 @@ static void hpet_legacy_clockevent_register(void)
hpet_enable_legacy_int();
/*
- * The mult factor is defined as (include/linux/clockchips.h)
- * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
- * hpet_period is in units of femtoseconds (per cycle), so
- * mult/2^shift = cyc/ns = 10^6/hpet_period
- * mult = (10^6 * 2^shift)/hpet_period
- * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
- */
- hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
- hpet_period, hpet_clockevent.shift);
- /* Calculate the min / max delta */
- hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
- &hpet_clockevent);
- /* Setup minimum reprogramming delta. */
- hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
- &hpet_clockevent);
-
- /*
* Start hpet with the boot cpu mask and make it
* global after the IO_APIC has been initialized.
*/
hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
- clockevents_register_device(&hpet_clockevent);
+ clockevents_config_and_register(&hpet_clockevent, hpet_freq,
+ HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
global_clock_event = &hpet_clockevent;
printk(KERN_DEBUG "hpet clockevent registered\n");
}
@@ -549,7 +533,6 @@ static int hpet_setup_irq(struct hpet_dev *dev)
static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
{
struct clock_event_device *evt = &hdev->evt;
- uint64_t hpet_freq;
WARN_ON(cpu != smp_processor_id());
if (!(hdev->flags & HPET_DEV_VALID))
@@ -571,24 +554,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
evt->set_mode = hpet_msi_set_mode;
evt->set_next_event = hpet_msi_next_event;
- evt->shift = 32;
-
- /*
- * The period is a femto seconds value. We need to calculate the
- * scaled math multiplication factor for nanosecond to hpet tick
- * conversion.
- */
- hpet_freq = FSEC_PER_SEC;
- do_div(hpet_freq, hpet_period);
- evt->mult = div_sc((unsigned long) hpet_freq,
- NSEC_PER_SEC, evt->shift);
- /* Calculate the max delta */
- evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
- /* 5 usec minimum reprogramming delta. */
- evt->min_delta_ns = 5000;
-
evt->cpumask = cpumask_of(hdev->cpu);
- clockevents_register_device(evt);
+
+ clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
+ 0x7FFFFFFF);
}
#ifdef CONFIG_HPET
@@ -770,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs)
return (cycle_t)hpet_readl(HPET_COUNTER);
}
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_hpet(void)
-{
- return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
-}
-#endif
-
static struct clocksource clocksource_hpet = {
.name = "hpet",
.rating = 250,
@@ -785,14 +747,13 @@ static struct clocksource clocksource_hpet = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
#ifdef CONFIG_X86_64
- .vread = vread_hpet,
+ .archdata = { .vclock_mode = VCLOCK_HPET },
#endif
};
static int hpet_clocksource_register(void)
{
u64 start, now;
- u64 hpet_freq;
cycle_t t1;
/* Start the counter */
@@ -819,24 +780,7 @@ static int hpet_clocksource_register(void)
return -ENODEV;
}
- /*
- * The definition of mult is (include/linux/clocksource.h)
- * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
- * so we first need to convert hpet_period to ns/cyc units:
- * mult/2^shift = ns/cyc = hpet_period/10^6
- * mult = (hpet_period * 2^shift)/10^6
- * mult = (hpet_period << shift)/FSEC_PER_NSEC
- */
-
- /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
- *
- * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
- * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
- */
- hpet_freq = FSEC_PER_SEC;
- do_div(hpet_freq, hpet_period);
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
-
return 0;
}
@@ -845,7 +789,9 @@ static int hpet_clocksource_register(void)
*/
int __init hpet_enable(void)
{
+ unsigned long hpet_period;
unsigned int id;
+ u64 freq;
int i;
if (!is_hpet_capable())
@@ -884,6 +830,14 @@ int __init hpet_enable(void)
goto out_nohpet;
/*
+ * The period is a femto seconds value. Convert it to a
+ * frequency.
+ */
+ freq = FSEC_PER_SEC;
+ do_div(freq, hpet_period);
+ hpet_freq = freq;
+
+ /*
* Read the HPET ID register to retrieve the IRQ routing
* information and the number of channels
*/
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 12aff253768..739d8598f78 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -321,7 +321,7 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
return tmp;
}
-#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
+#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
#define FP_EXP_TAG_VALID 0
#define FP_EXP_TAG_ZERO 1
#define FP_EXP_TAG_SPECIAL 2
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd3159744..f2b96de3c7c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,195 +3,27 @@
*
*/
#include <linux/clockchips.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/timex.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/io.h>
+#include <linux/i8253.h>
-#include <asm/i8253.h>
#include <asm/hpet.h>
+#include <asm/time.h>
#include <asm/smp.h>
-DEFINE_RAW_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
/*
* HPET replaces the PIT, when enabled. So we need to know, which of
* the two timers is used
*/
struct clock_event_device *global_clock_event;
-/*
- * Initialize the PIT timer.
- *
- * This is also called after resume to bring the PIT into operation again.
- */
-static void init_pit_timer(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- raw_spin_lock(&i8253_lock);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- /* binary, mode 2, LSB/MSB, ch 0 */
- outb_pit(0x34, PIT_MODE);
- outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
- outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
- break;
-
- case CLOCK_EVT_MODE_SHUTDOWN:
- case CLOCK_EVT_MODE_UNUSED:
- if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
- evt->mode == CLOCK_EVT_MODE_ONESHOT) {
- outb_pit(0x30, PIT_MODE);
- outb_pit(0, PIT_CH0);
- outb_pit(0, PIT_CH0);
- }
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- /* One shot setup */
- outb_pit(0x38, PIT_MODE);
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
- raw_spin_unlock(&i8253_lock);
-}
-
-/*
- * Program the next event in oneshot mode
- *
- * Delta is given in PIT ticks
- */
-static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
-{
- raw_spin_lock(&i8253_lock);
- outb_pit(delta & 0xff , PIT_CH0); /* LSB */
- outb_pit(delta >> 8 , PIT_CH0); /* MSB */
- raw_spin_unlock(&i8253_lock);
-
- return 0;
-}
-
-/*
- * On UP the PIT can serve all of the possible timer functions. On SMP systems
- * it can be solely used for the global tick.
- *
- * The profiling and update capabilities are switched off once the local apic is
- * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
- * !using_apic_timer decisions in do_timer_interrupt_hook()
- */
-static struct clock_event_device pit_ce = {
- .name = "pit",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = init_pit_timer,
- .set_next_event = pit_next_event,
- .shift = 32,
- .irq = 0,
-};
-
-/*
- * Initialize the conversion factor and the min/max deltas of the clock event
- * structure and register the clock event source with the framework.
- */
void __init setup_pit_timer(void)
{
- /*
- * Start pit with the boot cpu mask and make it global after the
- * IO_APIC has been initialized.
- */
- pit_ce.cpumask = cpumask_of(smp_processor_id());
- pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
- pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
- pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
-
- clockevents_register_device(&pit_ce);
- global_clock_event = &pit_ce;
+ clockevent_i8253_init(true);
+ global_clock_event = &i8253_clockevent;
}
#ifndef CONFIG_X86_64
-/*
- * Since the PIT overflows every tick, its not very useful
- * to just read by itself. So use jiffies to emulate a free
- * running counter:
- */
-static cycle_t pit_read(struct clocksource *cs)
-{
- static int old_count;
- static u32 old_jifs;
- unsigned long flags;
- int count;
- u32 jifs;
-
- raw_spin_lock_irqsave(&i8253_lock, flags);
- /*
- * Although our caller may have the read side of xtime_lock,
- * this is now a seqlock, and we are cheating in this routine
- * by having side effects on state that we cannot undo if
- * there is a collision on the seqlock and our caller has to
- * retry. (Namely, old_jifs and old_count.) So we must treat
- * jiffies as volatile despite the lock. We read jiffies
- * before latching the timer count to guarantee that although
- * the jiffies value might be older than the count (that is,
- * the counter may underflow between the last point where
- * jiffies was incremented and the point where we latch the
- * count), it cannot be newer.
- */
- jifs = jiffies;
- outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
- count = inb_pit(PIT_CH0); /* read the latched count */
- count |= inb_pit(PIT_CH0) << 8;
-
- /* VIA686a test code... reset the latch if count > max + 1 */
- if (count > LATCH) {
- outb_pit(0x34, PIT_MODE);
- outb_pit(LATCH & 0xff, PIT_CH0);
- outb_pit(LATCH >> 8, PIT_CH0);
- count = LATCH - 1;
- }
-
- /*
- * It's possible for count to appear to go the wrong way for a
- * couple of reasons:
- *
- * 1. The timer counter underflows, but we haven't handled the
- * resulting interrupt and incremented jiffies yet.
- * 2. Hardware problem with the timer, not giving us continuous time,
- * the counter does small "jumps" upwards on some Pentium systems,
- * (see c't 95/10 page 335 for Neptun bug.)
- *
- * Previous attempts to handle these cases intelligently were
- * buggy, so we just do the simple thing now.
- */
- if (count > old_count && jifs == old_jifs)
- count = old_count;
-
- old_count = count;
- old_jifs = jifs;
-
- raw_spin_unlock_irqrestore(&i8253_lock, flags);
-
- count = (LATCH - 1) - count;
-
- return (cycle_t)(jifs * LATCH) + count;
-}
-
-static struct clocksource pit_cs = {
- .name = "pit",
- .rating = 110,
- .read = pit_read,
- .mask = CLOCKSOURCE_MASK(32),
- .mult = 0,
- .shift = 20,
-};
-
static int __init init_pit_clocksource(void)
{
/*
@@ -202,13 +34,10 @@ static int __init init_pit_clocksource(void)
* - when local APIC timer is active (PIT is switched off)
*/
if (num_possible_cpus() > 1 || is_hpet_enabled() ||
- pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
+ i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
return 0;
- pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
-
- return clocksource_register(&pit_cs);
+ return clocksource_i8253_init();
}
arch_initcall(init_pit_clocksource);
-
#endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 65b8f5c2eeb..610485223bd 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -14,7 +14,7 @@
#include <linux/io.h>
#include <linux/delay.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/system.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 1cb0b9fc78d..6c0802eb2f7 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -249,7 +249,7 @@ void fixup_irqs(void)
data = irq_desc_get_irq_data(desc);
affinity = data->affinity;
- if (!irq_has_action(irq) ||
+ if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
cpumask_subset(affinity, cpu_online_mask)) {
raw_spin_unlock(&desc->lock);
continue;
@@ -276,7 +276,8 @@ void fixup_irqs(void)
else if (!(warned++))
set_affinity = 0;
- if (!irqd_can_move_in_process_context(data) && chip->irq_unmask)
+ if (!irqd_can_move_in_process_context(data) &&
+ !irqd_irq_disabled(data) && chip->irq_unmask)
chip->irq_unmask(data);
raw_spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f470e4ef993..b3300e6bace 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -15,7 +15,7 @@
#include <linux/io.h>
#include <linux/delay.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/system.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
@@ -272,9 +272,6 @@ static void __init apic_intr_init(void)
#ifdef CONFIG_X86_MCE_THRESHOLD
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
- alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/* self generated IPI for local APIC timer */
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 961b6b30ba9..3fee346ef54 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
code.offset = entry->target -
(entry->code + JUMP_LABEL_NOP_SIZE);
} else
- memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+ memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
get_online_cpus();
mutex_lock(&text_mutex);
text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
@@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry,
void arch_jump_label_text_poke_early(jump_label_t addr)
{
- text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+ text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
+ JUMP_LABEL_NOP_SIZE);
}
#endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5f9ecff328b..00354d4919a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -608,7 +608,7 @@ int kgdb_arch_init(void)
return register_die_notifier(&kgdb_notifier);
}
-static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
+static void kgdb_hw_overflow_handler(struct perf_event *event,
struct perf_sample_data *data, struct pt_regs *regs)
{
struct task_struct *tsk = current;
@@ -638,7 +638,7 @@ void kgdb_arch_late(void)
for (i = 0; i < HBP_NUM; i++) {
if (breakinfo[i].pev)
continue;
- breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
+ breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
if (IS_ERR((void * __force)breakinfo[i].pev)) {
printk(KERN_ERR "kgdb: Could not allocate hw"
"breakpoints\nDisabling the kernel debugger\n");
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index c969fd9d156..f1a6244d7d9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1183,12 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ unsigned long flags;
/* This is possible if op is under delayed unoptimizing */
if (kprobe_disabled(&op->kp))
return;
- preempt_disable();
+ local_irq_save(flags);
if (kprobe_running()) {
kprobes_inc_nmissed_count(&op->kp);
} else {
@@ -1207,7 +1208,7 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
opt_pre_handler(&op->kp, regs);
__this_cpu_write(current_kprobe, NULL);
}
- preempt_enable_no_resched();
+ local_irq_restore(flags);
}
static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0b122..a9c2116001d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
early_param("no-kvmapf", parse_no_kvmapf);
+static int steal_acc = 1;
+static int parse_no_stealacc(char *arg)
+{
+ steal_acc = 0;
+ return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
static struct kvm_para_state *kvm_para_state(void)
{
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
#endif
}
+static void kvm_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ memset(st, 0, sizeof(*st));
+
+ wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
+ printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
+ cpu, __pa(st));
+}
+
void __cpuinit kvm_guest_cpu_init(void)
{
if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
printk(KERN_INFO"KVM setup async PF for cpu %d\n",
smp_processor_id());
}
+
+ if (has_steal_clock)
+ kvm_register_steal_time();
}
static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
};
+static u64 kvm_steal_clock(int cpu)
+{
+ u64 steal;
+ struct kvm_steal_time *src;
+ int version;
+
+ src = &per_cpu(steal_time, cpu);
+ do {
+ version = src->version;
+ rmb();
+ steal = src->steal;
+ rmb();
+ } while ((version & 1) || (version != src->version));
+
+ return steal;
+}
+
+void kvm_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
static void kvm_guest_cpu_offline(void *dummy)
{
+ kvm_disable_steal_time();
kvm_pv_disable_apf(NULL);
apf_task_wake_all();
}
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ has_steal_clock = 1;
+ pv_time_ops.steal_clock = kvm_steal_clock;
+ }
+
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void)
kvm_guest_cpu_init();
#endif
}
+
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ jump_label_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ jump_label_inc(&paravirt_steal_rq_enabled);
+ }
+
+ return 0;
+}
+arch_initcall(activate_jump_labels);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f98d3eafe07..c1a0188e29a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
#include <asm/x86_init.h>
#include <asm/reboot.h>
-#define KVM_SCALE 22
-
static int kvmclock = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,8 +118,6 @@ static struct clocksource kvm_clock = {
.read = kvm_clock_get_cycles,
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << KVM_SCALE,
- .shift = KVM_SCALE,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -164,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
static void kvm_crash_shutdown(struct pt_regs *regs)
{
native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
native_machine_crash_shutdown(regs);
}
#endif
@@ -171,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
static void kvm_shutdown(void)
{
native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
native_machine_shutdown();
}
@@ -203,7 +201,7 @@ void __init kvmclock_init(void)
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
- clocksource_register(&kvm_clock);
+ clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.paravirt_enabled = 1;
pv_info.name = "KVM";
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c5610384ab1..591be0ee193 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,8 +66,8 @@ struct microcode_amd {
unsigned int mpb[0];
};
-#define UCODE_CONTAINER_SECTION_HDR 8
-#define UCODE_CONTAINER_HEADER_SIZE 12
+#define SECTION_HDR_SIZE 8
+#define CONTAINER_HDR_SZ 12
static struct equiv_cpu_entry *equiv_cpu_table;
@@ -157,7 +157,7 @@ static int apply_microcode_amd(int cpu)
static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
- unsigned int max_size, actual_size;
+ u32 max_size, actual_size;
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
@@ -175,9 +175,9 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
break;
}
- actual_size = buf[4] + (buf[5] << 8);
+ actual_size = *(u32 *)(buf + 4);
- if (actual_size > size || actual_size > max_size) {
+ if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
pr_err("section size mismatch\n");
return 0;
}
@@ -191,7 +191,7 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
struct microcode_header_amd *mc = NULL;
unsigned int actual_size = 0;
- if (buf[0] != UCODE_UCODE_TYPE) {
+ if (*(u32 *)buf != UCODE_UCODE_TYPE) {
pr_err("invalid type field in container file section header\n");
goto out;
}
@@ -204,8 +204,8 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
if (!mc)
goto out;
- get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
- *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
+ get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
+ *mc_size = actual_size + SECTION_HDR_SIZE;
out:
return mc;
@@ -229,9 +229,10 @@ static int install_equiv_cpu_table(const u8 *buf)
return -ENOMEM;
}
- get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
+ get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
- return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
+ /* add header length */
+ return size + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index ab23f1ad4bf..925179f871d 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/gfp.h>
+#include <linux/jump_label.h>
#include <asm/system.h>
#include <asm/page.h>
@@ -44,21 +45,6 @@ void *module_alloc(unsigned long size)
-1, __builtin_return_address(0));
}
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- char *secstrings,
- struct module *mod)
-{
- return 0;
-}
-
#ifdef CONFIG_X86_32
int apply_relocate(Elf32_Shdr *sechdrs,
const char *strtab,
@@ -99,17 +85,6 @@ int apply_relocate(Elf32_Shdr *sechdrs,
}
return 0;
}
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
- me->name);
- return -ENOEXEC;
-}
#else /*X86_64*/
int apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
@@ -180,17 +155,6 @@ overflow:
me->name);
return -ENOEXEC;
}
-
-int apply_relocate(Elf_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- printk(KERN_ERR "non add relocation not supported\n");
- return -ENOSYS;
-}
-
#endif
int module_finalize(const Elf_Ehdr *hdr,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5a532ce646b..9103b89c145 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -285,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
intsrc.type = MP_INTSRC;
intsrc.irqflag = 0; /* conforming */
intsrc.srcbus = 0;
- intsrc.dstapic = mp_ioapics[0].apicid;
+ intsrc.dstapic = mpc_ioapic_id(0);
intsrc.irqtype = mp_INT;
@@ -715,17 +715,15 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
}
}
-static int
+static int __init
check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
{
- int ret = 0;
-
if (!mpc_new_phys || count <= mpc_new_length) {
WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
return -1;
}
- return ret;
+ return 0;
}
#else /* CONFIG_X86_IO_APIC */
static
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1aeeb71..613a7931ecc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
__native_flush_tlb_single(addr);
}
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+ return 0;
+}
+
/* These are in entry.S */
extern void native_iret(void);
extern void native_irq_enable_sysexit(void);
@@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = {
struct pv_time_ops pv_time_ops = {
.sched_clock = native_sched_clock,
+ .steal_clock = native_steal_clock,
};
struct pv_irq_ops pv_irq_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e8c33a30200..726494b5834 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1553,7 +1553,7 @@ static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
continue;
/* cover the whole region */
- npages = (r->end - r->start) >> PAGE_SHIFT;
+ npages = resource_size(r) >> PAGE_SHIFT;
npages++;
iommu_range_reserve(tbl, r->start, npages);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9ea999a4dcc..b49d00da2ae 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -68,74 +68,10 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
-static __initdata void *dma32_bootmem_ptr;
-static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
-
-static int __init parse_dma32_size_opt(char *p)
-{
- if (!p)
- return -EINVAL;
- dma32_bootmem_size = memparse(p, &p);
- return 0;
-}
-early_param("dma32_size", parse_dma32_size_opt);
-
-void __init dma32_reserve_bootmem(void)
-{
- unsigned long size, align;
- if (max_pfn <= MAX_DMA32_PFN)
- return;
-
- /*
- * check aperture_64.c allocate_aperture() for reason about
- * using 512M as goal
- */
- align = 64ULL<<20;
- size = roundup(dma32_bootmem_size, align);
- dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
- 512ULL<<20);
- /*
- * Kmemleak should not scan this block as it may not be mapped via the
- * kernel direct mapping.
- */
- kmemleak_ignore(dma32_bootmem_ptr);
- if (dma32_bootmem_ptr)
- dma32_bootmem_size = size;
- else
- dma32_bootmem_size = 0;
-}
-static void __init dma32_free_bootmem(void)
-{
-
- if (max_pfn <= MAX_DMA32_PFN)
- return;
-
- if (!dma32_bootmem_ptr)
- return;
-
- free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
-
- dma32_bootmem_ptr = NULL;
- dma32_bootmem_size = 0;
-}
-#else
-void __init dma32_reserve_bootmem(void)
-{
-}
-static void __init dma32_free_bootmem(void)
-{
-}
-
-#endif
-
void __init pci_iommu_alloc(void)
{
struct iommu_table_entry *p;
- /* free the range so iommu could get some range less than 4G */
- dma32_free_bootmem();
-
sort_iommu_table(__iommu_table, __iommu_table_end);
check_iommu_entries(__iommu_table, __iommu_table_end);
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 55d745ec118..35ccf75696e 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
struct iommu_table_entry *finish)
{
struct iommu_table_entry *p, *q, *x;
- char sym_p[KSYM_SYMBOL_LEN];
- char sym_q[KSYM_SYMBOL_LEN];
/* Simple cyclic dependency checker. */
for (p = start; p < finish; p++) {
q = find_dependents_of(start, finish, p);
x = find_dependents_of(start, finish, q);
if (p == x) {
- sprint_symbol(sym_p, (unsigned long)p->detect);
- sprint_symbol(sym_q, (unsigned long)q->detect);
-
- printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
- " on %s and vice-versa. BREAKING IT.\n",
- sym_p, sym_q);
+ printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
+ p->detect, q->detect);
/* Heavy handed way..*/
x->depend = 0;
}
@@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
for (p = start; p < finish; p++) {
q = find_dependents_of(p, finish, p);
if (q && q > p) {
- sprint_symbol(sym_p, (unsigned long)p->detect);
- sprint_symbol(sym_q, (unsigned long)q->detect);
-
- printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
- "should be called before %s!\n",
- sym_p, sym_q);
+ printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
+ p->detect, q->detect);
}
}
}
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e..63228035f9d 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
+/* does this oprom support the given pci device, or any of the devices
+ * that the driver supports?
+ */
+static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
+{
+ struct pci_driver *drv = pdev->driver;
+ const struct pci_device_id *id;
+
+ if (pdev->vendor == vendor && pdev->device == device)
+ return true;
+
+ for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
+ if (id->vendor == vendor && id->device == device)
+ break;
+
+ return id && id->vendor;
+}
+
+static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
+ const unsigned char *rom_list)
+{
+ unsigned short device;
+
+ do {
+ if (probe_kernel_address(rom_list, device) != 0)
+ device = 0;
+
+ if (device && match_id(pdev, vendor, device))
+ break;
+
+ rom_list += 2;
+ } while (device);
+
+ return !!device;
+}
+
+static struct resource *find_oprom(struct pci_dev *pdev)
+{
+ struct resource *oprom = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+ struct resource *res = &adapter_rom_resources[i];
+ unsigned short offset, vendor, device, list, rev;
+ const unsigned char *rom;
+
+ if (res->end == 0)
+ break;
+
+ rom = isa_bus_to_virt(res->start);
+ if (probe_kernel_address(rom + 0x18, offset) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x6, device) != 0)
+ continue;
+
+ if (match_id(pdev, vendor, device)) {
+ oprom = res;
+ break;
+ }
+
+ if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
+ probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
+ rev >= 3 && list &&
+ probe_list(pdev, vendor, rom + offset + list)) {
+ oprom = res;
+ break;
+ }
+ }
+
+ return oprom;
+}
+
+void *pci_map_biosrom(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ if (!oprom)
+ return NULL;
+
+ return ioremap(oprom->start, resource_size(oprom));
+}
+EXPORT_SYMBOL(pci_map_biosrom);
+
+void pci_unmap_biosrom(void __iomem *image)
+{
+ iounmap(image);
+}
+EXPORT_SYMBOL(pci_unmap_biosrom);
+
+size_t pci_biosrom_size(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ return oprom ? resource_size(oprom) : 0;
+}
+EXPORT_SYMBOL(pci_biosrom_size);
+
#define ROMSIGNATURE 0xaa55
static int __init romsignature(const unsigned char *rom)
@@ -133,7 +234,7 @@ void __init probe_roms(void)
/* check for extension rom (ignore length byte!) */
rom = isa_bus_to_virt(extension_rom_resource.start);
if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ length = resource_size(&extension_rom_resource);
if (romchecksum(rom, length)) {
request_resource(&iomem_resource, &extension_rom_resource);
upper = extension_rom_resource.start;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d46cbe46b7a..e1ba8cb24e4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -337,7 +337,9 @@ EXPORT_SYMBOL(boot_option_idle_override);
* Powermanagement idle function, if any..
*/
void (*pm_idle)(void);
+#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(pm_idle);
+#endif
#ifdef CONFIG_X86_32
/*
@@ -449,7 +451,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
if (!need_resched()) {
- if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -465,7 +467,7 @@ static void mwait_idle(void)
if (!need_resched()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
trace_cpu_idle(1, smp_processor_id());
- if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -535,45 +537,45 @@ int mwait_usable(const struct cpuinfo_x86 *c)
return (edx & MWAIT_EDX_C1);
}
-bool c1e_detected;
-EXPORT_SYMBOL(c1e_detected);
+bool amd_e400_c1e_detected;
+EXPORT_SYMBOL(amd_e400_c1e_detected);
-static cpumask_var_t c1e_mask;
+static cpumask_var_t amd_e400_c1e_mask;
-void c1e_remove_cpu(int cpu)
+void amd_e400_remove_cpu(int cpu)
{
- if (c1e_mask != NULL)
- cpumask_clear_cpu(cpu, c1e_mask);
+ if (amd_e400_c1e_mask != NULL)
+ cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}
/*
- * C1E aware idle routine. We check for C1E active in the interrupt
+ * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
* pending message MSR. If we detect C1E, then we handle it the same
* way as C3 power states (local apic timer and TSC stop)
*/
-static void c1e_idle(void)
+static void amd_e400_idle(void)
{
if (need_resched())
return;
- if (!c1e_detected) {
+ if (!amd_e400_c1e_detected) {
u32 lo, hi;
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- c1e_detected = true;
+ amd_e400_c1e_detected = true;
if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
printk(KERN_INFO "System has AMD C1E enabled\n");
}
}
- if (c1e_detected) {
+ if (amd_e400_c1e_detected) {
int cpu = smp_processor_id();
- if (!cpumask_test_cpu(cpu, c1e_mask)) {
- cpumask_set_cpu(cpu, c1e_mask);
+ if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
+ cpumask_set_cpu(cpu, amd_e400_c1e_mask);
/*
* Force broadcast so ACPI can not interfere.
*/
@@ -616,17 +618,17 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
pm_idle = mwait_idle;
} else if (cpu_has_amd_erratum(amd_erratum_400)) {
/* E400: APIC timer interrupt does not wake up CPU from C1e */
- printk(KERN_INFO "using C1E aware idle routine\n");
- pm_idle = c1e_idle;
+ printk(KERN_INFO "using AMD E400 aware idle routine\n");
+ pm_idle = amd_e400_idle;
} else
pm_idle = default_idle;
}
-void __init init_c1e_mask(void)
+void __init init_amd_e400_c1e_mask(void)
{
- /* If we're using c1e_idle, we need to allocate c1e_mask. */
- if (pm_idle == c1e_idle)
- zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
+ /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
+ if (pm_idle == amd_e400_idle)
+ zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}
static int __init idle_setup(char *str)
@@ -640,6 +642,7 @@ static int __init idle_setup(char *str)
boot_option_idle_override = IDLE_POLL;
} else if (!strcmp(str, "mwait")) {
boot_option_idle_override = IDLE_FORCE_MWAIT;
+ WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
} else if (!strcmp(str, "halt")) {
/*
* When the boot option of idle=halt is added, halt is
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af4..a3d0dc59067 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -245,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
set_user_gs(regs, 0);
regs->fs = 0;
- set_fs(USER_DS);
regs->ds = __USER_DS;
regs->es = __USER_DS;
regs->ss = __USER_DS;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6c9dd922ac0..ca6f7ab8df3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -338,7 +338,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
- set_fs(USER_DS);
/*
* Free the old FP and other extended state
*/
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 45892dc4b72..82528799c5d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target,
return ret;
}
-static void ptrace_triggered(struct perf_event *bp, int nmi,
+static void ptrace_triggered(struct perf_event *bp,
struct perf_sample_data *data,
struct pt_regs *regs)
{
@@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
unsigned len, type;
struct perf_event *bp;
+ if (ptrace_get_breakpoints(tsk) < 0)
+ return -ESRCH;
+
data &= ~DR_CONTROL_RESERVED;
old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
restore:
@@ -655,6 +658,9 @@ restore:
}
goto restore;
}
+
+ ptrace_put_breakpoints(tsk);
+
return ((orig_ret < 0) ? orig_ret : rc);
}
@@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
if (n < HBP_NUM) {
struct perf_event *bp;
+
+ if (ptrace_get_breakpoints(tsk) < 0)
+ return -ESRCH;
+
bp = thread->ptrace_bps[n];
if (!bp)
- return 0;
- val = bp->hw.info.address;
+ val = 0;
+ else
+ val = bp->hw.info.address;
+
+ ptrace_put_breakpoints(tsk);
} else if (n == 6) {
val = thread->debugreg6;
} else if (n == 7) {
@@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
struct perf_event *bp;
struct thread_struct *t = &tsk->thread;
struct perf_event_attr attr;
+ int err = 0;
+
+ if (ptrace_get_breakpoints(tsk) < 0)
+ return -ESRCH;
if (!t->ptrace_bps[nr]) {
ptrace_breakpoint_init(&attr);
@@ -698,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
attr.bp_type = HW_BREAKPOINT_W;
attr.disabled = 1;
- bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+ bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
+ NULL, tsk);
/*
* CHECKME: the previous code returned -EIO if the addr wasn't
@@ -709,24 +727,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
* writing for the user. And anyway this is the previous
* behaviour.
*/
- if (IS_ERR(bp))
- return PTR_ERR(bp);
+ if (IS_ERR(bp)) {
+ err = PTR_ERR(bp);
+ goto put;
+ }
t->ptrace_bps[nr] = bp;
} else {
- int err;
-
bp = t->ptrace_bps[nr];
attr = bp->attr;
attr.bp_addr = addr;
err = modify_user_hw_breakpoint(bp, &attr);
- if (err)
- return err;
}
-
- return 0;
+put:
+ ptrace_put_breakpoints(tsk);
+ return err;
}
/*
@@ -1347,7 +1364,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
* We must return the syscall number to actually look up in the table.
* This can be -1L to skip running any syscall at all.
*/
-asmregparm long syscall_trace_enter(struct pt_regs *regs)
+long syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
@@ -1392,7 +1409,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
return ret ?: regs->orig_ax;
}
-asmregparm void syscall_trace_leave(struct pt_regs *regs)
+void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 8bbe8c56916..b78643d0f9a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -10,7 +10,7 @@
static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
{
- u8 config, rev;
+ u8 config;
u16 word;
/* BIOS may enable hardware IRQ balancing for
@@ -18,8 +18,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
* based platforms.
* Disable SW irqbalance/affinity on those platforms.
*/
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev > 0x9)
+ if (dev->revision > 0x9)
return;
/* enable access to config space*/
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 08c44b08bf5..9242436e993 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
+enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -294,6 +294,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
},
},
+ { /* Handle reboot issue on Acer Aspire one */
+ .callback = set_bios_reboot,
+ .ident = "Acer Aspire One A110",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+ },
+ },
{ }
};
@@ -411,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
+ { /* Handle problems with rebooting on the Latitude E6320. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6320",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E5420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E6420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+ },
+ },
{ }
};
@@ -478,9 +510,24 @@ void __attribute__((weak)) mach_reboot_fixups(void)
{
}
+/*
+ * Windows compatible x86 hardware expects the following on reboot:
+ *
+ * 1) If the FADT has the ACPI reboot register flag set, try it
+ * 2) If still alive, write to the keyboard controller
+ * 3) If still alive, write to the ACPI reboot register again
+ * 4) If still alive, write to the keyboard controller again
+ *
+ * If the machine is still alive at this stage, it gives up. We default to
+ * following the same pattern, except that if we're still alive after (4) we'll
+ * try to force a triple fault and then cycle between hitting the keyboard
+ * controller and doing that
+ */
static void native_machine_emergency_restart(void)
{
int i;
+ int attempt = 0;
+ int orig_reboot_type = reboot_type;
if (reboot_emergency)
emergency_vmx_disable_all();
@@ -502,6 +549,13 @@ static void native_machine_emergency_restart(void)
outb(0xfe, 0x64); /* pulse reset low */
udelay(50);
}
+ if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
+ attempt = 1;
+ reboot_type = BOOT_ACPI;
+ } else {
+ reboot_type = BOOT_TRIPLE;
+ }
+ break;
case BOOT_TRIPLE:
load_idt(&no_idt);
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
index 29092b38d81..1d5c46df0d7 100644
--- a/arch/x86/kernel/reboot_32.S
+++ b/arch/x86/kernel/reboot_32.S
@@ -21,26 +21,26 @@ r_base = .
/* Get our own relocated address */
call 1f
1: popl %ebx
- subl $1b, %ebx
+ subl $(1b - r_base), %ebx
/* Compute the equivalent real-mode segment */
movl %ebx, %ecx
shrl $4, %ecx
/* Patch post-real-mode segment jump */
- movw dispatch_table(%ebx,%eax,2),%ax
- movw %ax, 101f(%ebx)
- movw %cx, 102f(%ebx)
+ movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
+ movw %ax, (101f - r_base)(%ebx)
+ movw %cx, (102f - r_base)(%ebx)
/* Set up the IDT for real mode. */
- lidtl machine_real_restart_idt(%ebx)
+ lidtl (machine_real_restart_idt - r_base)(%ebx)
/*
* Set up a GDT from which we can load segment descriptors for real
* mode. The GDT is not used in real mode; it is just needed here to
* prepare the descriptors.
*/
- lgdtl machine_real_restart_gdt(%ebx)
+ lgdtl (machine_real_restart_gdt - r_base)(%ebx)
/*
* Load the data segment registers with 16-bit compatible values
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 41235531b11..36818f8ec2b 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -97,6 +97,8 @@ relocate_kernel:
ret
identity_mapped:
+ /* set return address to 0 if not preserving context */
+ pushl $0
/* store the start address on the stack */
pushl %edx
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b3d47..7a6f3b3be3c 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -100,6 +100,8 @@ relocate_kernel:
ret
identity_mapped:
+ /* set return address to 0 if not preserving context */
+ pushq $0
/* store the start address on the stack */
pushq %rdx
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4be9b398470..afaf38447ef 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -691,8 +691,6 @@ early_param("reservelow", parse_reservelow);
void __init setup_arch(char **cmdline_p)
{
- unsigned long flags;
-
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
@@ -912,6 +910,13 @@ void __init setup_arch(char **cmdline_p)
memblock.current_limit = get_max_mapped();
memblock_x86_fill();
+ /*
+ * The EFI specification says that boot service code won't be called
+ * after ExitBootServices(). This is, in fact, a lie.
+ */
+ if (efi_enabled)
+ efi_reserve_boot_services();
+
/* preallocate 4k for mptable mpc */
early_reserve_e820_mpc_new();
@@ -948,6 +953,8 @@ void __init setup_arch(char **cmdline_p)
if (init_ohci1394_dma_early)
init_ohci1394_dma_on_all_controllers();
#endif
+ /* Allocate bigger log buffer */
+ setup_log_buf(1);
reserve_initrd();
@@ -966,7 +973,6 @@ void __init setup_arch(char **cmdline_p)
initmem_init();
memblock_find_dma_reserve();
- dma32_reserve_bootmem();
#ifdef CONFIG_KVM_CLOCK
kvmclock_init();
@@ -1041,9 +1047,7 @@ void __init setup_arch(char **cmdline_p)
mcheck_init();
- local_irq_save(flags);
- arch_init_ideal_nop5();
- local_irq_restore(flags);
+ arch_init_ideal_nops();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4fd173cd8e5..54ddaeb221c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -485,17 +485,18 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
asmlinkage int
sys_sigsuspend(int history0, int history1, old_sigset_t mask)
{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
+ sigset_t blocked;
+
current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+
+ mask &= _BLOCKABLE;
+ siginitset(&blocked, mask);
+ set_current_blocked(&blocked);
current->state = TASK_INTERRUPTIBLE;
schedule();
- set_restore_sigmask();
+ set_restore_sigmask();
return -ERESTARTNOHAND;
}
@@ -572,10 +573,7 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
goto badframe;
sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->sc, &ax))
goto badframe;
@@ -601,10 +599,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
goto badframe;
sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
@@ -656,11 +651,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
static int
setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
+ struct pt_regs *regs)
{
int usig = signr_convert(sig);
+ sigset_t *set = &current->blocked;
int ret;
+ if (current_thread_info()->status & TS_RESTORE_SIGMASK)
+ set = &current->saved_sigmask;
+
/* Set up the stack frame */
if (is_ia32) {
if (ka->sa.sa_flags & SA_SIGINFO)
@@ -675,13 +674,15 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
return -EFAULT;
}
+ current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
return ret;
}
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- sigset_t *oldset, struct pt_regs *regs)
+ struct pt_regs *regs)
{
+ sigset_t blocked;
int ret;
/* Are we from a system call? */
@@ -714,20 +715,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
regs->flags &= ~X86_EFLAGS_TF;
- ret = setup_rt_frame(sig, ka, info, oldset, regs);
+ ret = setup_rt_frame(sig, ka, info, regs);
if (ret)
return ret;
-#ifdef CONFIG_X86_64
- /*
- * This has nothing to do with segment registers,
- * despite the name. This magic affects uaccess.h
- * macros' behavior. Reset it to the normal setting.
- */
- set_fs(USER_DS);
-#endif
-
/*
* Clear the direction flag as per the ABI for function entry.
*/
@@ -741,12 +733,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
regs->flags &= ~X86_EFLAGS_TF;
- spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+ sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked, sig);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ sigaddset(&blocked, sig);
+ set_current_blocked(&blocked);
tracehook_signal_handler(sig, info, ka, regs,
test_thread_flag(TIF_SINGLESTEP));
@@ -771,7 +761,6 @@ static void do_signal(struct pt_regs *regs)
struct k_sigaction ka;
siginfo_t info;
int signr;
- sigset_t *oldset;
/*
* We want the common case to go fast, which is why we may in certain
@@ -783,23 +772,10 @@ static void do_signal(struct pt_regs *regs)
if (!user_mode(regs))
return;
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- oldset = &current->saved_sigmask;
- else
- oldset = &current->blocked;
-
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
/* Whee! Actually deliver the signal. */
- if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
- /*
- * A signal was successfully delivered; the saved
- * sigmask will have been stored in the signal frame,
- * and will be restored by sigreturn, so we can simply
- * clear the TS_RESTORE_SIGMASK flag.
- */
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- }
+ handle_signal(signr, &info, &ka, regs);
return;
}
@@ -827,7 +803,7 @@ static void do_signal(struct pt_regs *regs)
*/
if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+ set_current_blocked(&current->saved_sigmask);
}
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 513deac7228..013e7eba83b 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait)
}
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8ed8908cc9f..9f548cb4a95 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -285,6 +285,19 @@ notrace static void __cpuinit start_secondary(void *unused)
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
x86_platform.nmi_init();
+ /*
+ * Wait until the cpu which brought this one up marked it
+ * online before enabling interrupts. If we don't do that then
+ * we can end up waking up the softirq thread before this cpu
+ * reached the active state, which makes the scheduler unhappy
+ * and schedule the softirq thread on the wrong cpu. This is
+ * only observable with forced threaded interrupts, but in
+ * theory it could also happen w/o them. It's just way harder
+ * to achieve.
+ */
+ while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
+ cpu_relax();
+
/* enable local interrupts */
local_irq_enable();
@@ -312,26 +325,6 @@ void __cpuinit smp_store_cpu_info(int id)
identify_secondary_cpu(c);
}
-static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2)
-{
- int node1 = early_cpu_to_node(cpu1);
- int node2 = early_cpu_to_node(cpu2);
-
- /*
- * Our CPU scheduler assumes all logical cpus in the same physical cpu
- * share the same node. But, buggy ACPI or NUMA emulation might assign
- * them to different node. Fix it.
- */
- if (node1 != node2) {
- pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n",
- cpu1, node1, cpu2, node2, node2);
-
- numa_remove_cpu(cpu1);
- numa_set_node(cpu1, node2);
- numa_add_cpu(cpu1);
- }
-}
-
static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
{
cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
@@ -340,7 +333,6 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
- check_cpu_siblings_on_same_node(cpu1, cpu2);
}
@@ -382,12 +374,10 @@ void __cpuinit set_cpu_sibling_map(int cpu)
per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
- check_cpu_siblings_on_same_node(cpu, i);
}
if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
cpumask_set_cpu(i, cpu_core_mask(cpu));
cpumask_set_cpu(cpu, cpu_core_mask(i));
- check_cpu_siblings_on_same_node(cpu, i);
/*
* Does this new cpu bringup a new core?
*/
@@ -448,7 +438,7 @@ static void impress_friends(void)
void __inquire_remote_apic(int apicid)
{
unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
- char *names[] = { "ID", "VERSION", "SPIV" };
+ const char * const names[] = { "ID", "VERSION", "SPIV" };
int timeout;
u32 status;
@@ -1330,7 +1320,7 @@ void play_dead_common(void)
{
idle_task_exit();
reset_lazy_tlbstate();
- c1e_remove_cpu(raw_smp_processor_id());
+ amd_e400_remove_cpu(raw_smp_processor_id());
mb();
/* Ack it */
@@ -1355,9 +1345,9 @@ static inline void mwait_play_dead(void)
void *mwait_ptr;
struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
- if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
+ if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
return;
- if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
+ if (!this_cpu_has(X86_FEATURE_CLFLSH))
return;
if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
return;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6515733a289..fdd0c6430e5 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -9,15 +9,6 @@
#include <linux/uaccess.h>
#include <asm/stacktrace.h>
-static void save_stack_warning(void *data, char *msg)
-{
-}
-
-static void
-save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-}
-
static int save_stack_stack(void *data, char *name)
{
return 0;
@@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops save_stack_ops = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
.stack = save_stack_stack,
.address = save_stack_address,
.walk_stack = print_context_stack,
};
static const struct stacktrace_ops save_stack_ops_nosched = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
.stack = save_stack_stack,
.address = save_stack_address_nosched,
.walk_stack = print_context_stack,
@@ -79,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace)
}
EXPORT_SYMBOL_GPL(save_stack_trace);
-void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
{
dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index abce34d5c79..fbb0a045a1a 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -344,3 +344,5 @@ ENTRY(sys_call_table)
.long sys_open_by_handle_at
.long sys_clock_adjtime
.long sys_syncfs
+ .long sys_sendmmsg /* 345 */
+ .long sys_setns
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 998e972f3b1..e07a2fc876b 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -36,6 +36,7 @@
#include <asm/bootparam.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/swiotlb.h>
#include <asm/fixmap.h>
#include <asm/proto.h>
#include <asm/setup.h>
@@ -110,7 +111,6 @@ static struct mm_struct tboot_mm = {
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
- .cpu_vm_mask = CPU_MASK_ALL,
};
static inline void switch_to_tboot_pt(void)
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 787a5e499dd..3f92ce07e52 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -161,7 +161,7 @@ static int test_NX(void)
}
#endif
- return 0;
+ return ret;
}
static void test_exit(void)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25a28a24593..5a64d057be5 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -11,19 +11,19 @@
#include <linux/clockchips.h>
#include <linux/interrupt.h>
+#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/mca.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/i8259.h>
-#include <asm/i8253.h>
#include <asm/timer.h>
#include <asm/hpet.h>
#include <asm/time.h>
#ifdef CONFIG_X86_64
-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
#endif
unsigned long profile_pc(struct pt_regs *regs)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9d..9682ec50180 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -49,7 +49,7 @@
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/system.h>
#include <asm/traps.h>
#include <asm/desc.h>
@@ -872,6 +872,12 @@ void __init trap_init(void)
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
+#ifdef CONFIG_X86_64
+ BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
+ set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
+ set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
+#endif
+
/*
* Should be a barrier for any external CPU state:
*/
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9335bf7dd2e..db483369f10 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -5,7 +5,6 @@
#include <linux/timer.h>
#include <linux/acpi_pmtmr.h>
#include <linux/cpufreq.h>
-#include <linux/dmi.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
#include <linux/percpu.h>
@@ -763,25 +762,6 @@ static cycle_t read_tsc(struct clocksource *cs)
ret : clocksource_tsc.cycle_last;
}
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
- cycle_t ret;
-
- /*
- * Surround the RDTSC by barriers, to make sure it's not
- * speculated to outside the seqlock critical section and
- * does not cause time warps:
- */
- rdtsc_barrier();
- ret = (cycle_t)vget_cycles();
- rdtsc_barrier();
-
- return ret >= __vsyscall_gtod_data.clock.cycle_last ?
- ret : __vsyscall_gtod_data.clock.cycle_last;
-}
-#endif
-
static void resume_tsc(struct clocksource *cs)
{
clocksource_tsc.cycle_last = 0;
@@ -796,7 +776,7 @@ static struct clocksource clocksource_tsc = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
- .vread = vread_tsc,
+ .archdata = { .vclock_mode = VCLOCK_TSC },
#endif
};
@@ -819,27 +799,6 @@ void mark_tsc_unstable(char *reason)
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
- d->ident);
- tsc_unstable = 1;
- return 0;
-}
-
-/* List of systems that have known TSC problems */
-static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
- {
- .callback = dmi_mark_tsc_unstable,
- .ident = "IBM Thinkpad 380XD",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
- },
- },
- {}
-};
-
static void __init check_system_tsc_reliable(void)
{
#ifdef CONFIG_MGEODE_LX
@@ -1029,8 +988,6 @@ void __init tsc_init(void)
lpj_fine = lpj;
use_tsc_delay();
- /* Check and install the TSC clocksource */
- dmi_check_system(bad_tsc_dmi_table);
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 624a2016198..4aa9c54a9b7 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -166,50 +166,18 @@ SECTIONS
__vsyscall_0 = .;
. = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
+ .vsyscall : AT(VLOAD(.vsyscall)) {
*(.vsyscall_0)
- } :user
- . = ALIGN(L1_CACHE_BYTES);
- .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
- *(.vsyscall_fn)
- }
-
- . = ALIGN(L1_CACHE_BYTES);
- .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
- *(.vsyscall_gtod_data)
- }
-
- vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
- .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
- *(.vsyscall_clock)
- }
- vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+ . = 1024;
*(.vsyscall_1)
- }
- .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
- *(.vsyscall_2)
- }
- .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
- *(.vgetcpu_mode)
- }
- vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
- . = ALIGN(L1_CACHE_BYTES);
- .jiffies : AT(VLOAD(.jiffies)) {
- *(.jiffies)
- }
- jiffies = VVIRT(.jiffies);
-
- .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
- *(.vsyscall_3)
- }
+ . = 2048;
+ *(.vsyscall_2)
- . = __vsyscall_0 + PAGE_SIZE;
+ . = 4096; /* Pad the whole page. */
+ } :user =0xcc
+ . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
#undef VSYSCALL_ADDR
#undef VLOAD_OFFSET
@@ -217,6 +185,23 @@ SECTIONS
#undef VVIRT_OFFSET
#undef VVIRT
+ __vvar_page = .;
+
+ .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) \
+ . = offset; \
+ *(.vvar_ ## name)
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+#undef EMIT_VVAR
+
+ } :data
+
+ . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+
#endif /* CONFIG_X86_64 */
/* Init code and data - will be freed after init */
@@ -306,6 +291,13 @@ SECTIONS
}
. = ALIGN(8);
+ .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
+ __apicdrivers = .;
+ *(.apicdrivers);
+ __apicdrivers_end = .;
+ }
+
+ . = ALIGN(8);
/*
* .exit.text is discard at runtime, not link time, to deal with
* references from .altinstructions and .eh_frame
@@ -319,7 +311,7 @@ SECTIONS
}
#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
- PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE)
+ PERCPU_SECTION(INTERNODE_CACHE_BYTES)
#endif
. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b69..dda7dff9cef 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
+ * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
+ *
* Thanks to hpa@transmeta.com for some useful hint.
* Special thanks to Ingo Molnar for his early experience with
* a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,10 +13,9 @@
* vsyscalls. One vsyscall can reserve more than 1 slot to avoid
* jumping out of line if necessary. We cannot add more with this
* mechanism because older kernels won't return -ENOSYS.
- * If we want more than four we need a vDSO.
*
- * Note: the concept clashes with user mode linux. If you use UML and
- * want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ * Note: the concept clashes with user mode linux. UML users should
+ * use the vDSO.
*/
/* Disable profiling for userspace code: */
@@ -32,9 +33,12 @@
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
#include <asm/vsyscall.h>
#include <asm/pgtable.h>
+#include <asm/compat.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
@@ -44,23 +48,12 @@
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>
+#include <asm/traps.h>
-#define __vsyscall(nr) \
- __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
-#define __syscall_clobber "r11","cx","memory"
-
-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache line ping pongs
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
-
-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
+DEFINE_VVAR(int, vgetcpu_mode);
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
- .lock = SEQLOCK_UNLOCKED,
- .sysctl_enabled = 1,
+ .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
};
void update_vsyscall_tz(void)
@@ -79,178 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
unsigned long flags;
write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+
/* copy vsyscall data */
- vsyscall_gtod_data.clock.vread = clock->vread;
- vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
- vsyscall_gtod_data.clock.mask = clock->mask;
- vsyscall_gtod_data.clock.mult = mult;
- vsyscall_gtod_data.clock.shift = clock->shift;
- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = *wtm;
- vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
+ vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
+ vsyscall_gtod_data.clock.mask = clock->mask;
+ vsyscall_gtod_data.clock.mult = mult;
+ vsyscall_gtod_data.clock.shift = clock->shift;
+ vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
+ vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+ vsyscall_gtod_data.wall_to_monotonic = *wtm;
+ vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
-/* RED-PEN may want to readd seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+ const char *message)
{
- *tz = __vsyscall_gtod_data.sys_tz;
-}
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+ struct task_struct *tsk;
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
- int ret;
- asm volatile("syscall"
- : "=a" (ret)
- : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
- : __syscall_clobber );
- return ret;
-}
+ if (!show_unhandled_signals || !__ratelimit(&rs))
+ return;
-static __always_inline long time_syscall(long *t)
-{
- long secs;
- asm volatile("syscall"
- : "=a" (secs)
- : "0" (__NR_time),"D" (t) : __syscall_clobber);
- return secs;
-}
+ tsk = current;
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
- cycle_t now, base, mask, cycle_delta;
- unsigned seq;
- unsigned long mult, shift, nsec;
- cycle_t (*vread)(void);
- do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
-
- vread = __vsyscall_gtod_data.clock.vread;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
- gettimeofday(tv,NULL);
- return;
- }
-
- now = vread();
- base = __vsyscall_gtod_data.clock.cycle_last;
- mask = __vsyscall_gtod_data.clock.mask;
- mult = __vsyscall_gtod_data.clock.mult;
- shift = __vsyscall_gtod_data.clock.shift;
-
- tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
- nsec = __vsyscall_gtod_data.wall_time_nsec;
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
-
- /* calculate interval: */
- cycle_delta = (now - base) & mask;
- /* convert to nsecs: */
- nsec += (cycle_delta * mult) >> shift;
-
- while (nsec >= NSEC_PER_SEC) {
- tv->tv_sec += 1;
- nsec -= NSEC_PER_SEC;
- }
- tv->tv_usec = nsec / NSEC_PER_USEC;
+ printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, tsk->comm, task_pid_nr(tsk),
+ message, regs->ip - 2, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+static int addr_to_vsyscall_nr(unsigned long addr)
{
- if (tv)
- do_vgettimeofday(tv);
- if (tz)
- do_get_tz(tz);
- return 0;
-}
+ int nr;
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
- unsigned seq;
- time_t result;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
- return time_syscall(t);
+ if ((addr & ~0xC00UL) != VSYSCALL_START)
+ return -EINVAL;
- do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+ nr = (addr & 0xC00UL) >> 10;
+ if (nr >= 3)
+ return -EINVAL;
- result = __vsyscall_gtod_data.wall_time_sec;
+ return nr;
+}
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk;
+ unsigned long caller;
+ int vsyscall_nr;
+ long ret;
+
+ local_irq_enable();
+
+ /*
+ * Real 64-bit user mode code has cs == __USER_CS. Anything else
+ * is bogus.
+ */
+ if (regs->cs != __USER_CS) {
+ /*
+ * If we trapped from kernel mode, we might as well OOPS now
+ * instead of returning to some random address and OOPSing
+ * then.
+ */
+ BUG_ON(!user_mode(regs));
+
+ /* Compat mode and non-compat 32-bit CS should both segfault. */
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "illegal int 0xcc from 32-bit mode");
+ goto sigsegv;
+ }
- if (t)
- *t = result;
- return result;
-}
+ /*
+ * x86-ism here: regs->ip points to the instruction after the int 0xcc,
+ * and int 0xcc is two bytes long.
+ */
+ vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
+ if (vsyscall_nr < 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "illegal int 0xcc (exploit attempt?)");
+ goto sigsegv;
+ }
-/* Fast way to get current CPU and node.
- This helps to do per node and per CPU caches in user space.
- The result is not guaranteed without CPU affinity, but usually
- works out because the scheduler tries to keep a thread on the same
- CPU.
+ if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
+ goto sigsegv;
+ }
- tcache must point to a two element sized long array.
- All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
- unsigned int p;
- unsigned long j = 0;
-
- /* Fast cache - only recompute value once per jiffies and avoid
- relatively costly rdtscp/cpuid otherwise.
- This works because the scheduler usually keeps the process
- on the same CPU and this syscall doesn't guarantee its
- results anyways.
- We do this here because otherwise user space would do it on
- its own in a likely inferior way (no access to jiffies).
- If you don't like it pass NULL. */
- if (tcache && tcache->blob[0] == (j = __jiffies)) {
- p = tcache->blob[1];
- } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
- /* Load per CPU data from RDTSCP */
- native_read_tscp(&p);
- } else {
- /* Load per CPU data from GDT */
- asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ tsk = current;
+ if (seccomp_mode(&tsk->seccomp))
+ do_exit(SIGKILL);
+
+ switch (vsyscall_nr) {
+ case 0:
+ ret = sys_gettimeofday(
+ (struct timeval __user *)regs->di,
+ (struct timezone __user *)regs->si);
+ break;
+
+ case 1:
+ ret = sys_time((time_t __user *)regs->di);
+ break;
+
+ case 2:
+ ret = sys_getcpu((unsigned __user *)regs->di,
+ (unsigned __user *)regs->si,
+ 0);
+ break;
}
- if (tcache) {
- tcache->blob[0] = j;
- tcache->blob[1] = p;
+
+ if (ret == -EFAULT) {
+ /*
+ * Bad news -- userspace fed a bad pointer to a vsyscall.
+ *
+ * With a real vsyscall, that would have caused SIGSEGV.
+ * To make writing reliable exploits using the emulated
+ * vsyscalls harder, generate SIGSEGV here as well.
+ */
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall fault (exploit attempt?)");
+ goto sigsegv;
}
- if (cpu)
- *cpu = p & 0xfff;
- if (node)
- *node = p >> 12;
- return 0;
-}
-static long __vsyscall(3) venosys_1(void)
-{
- return -ENOSYS;
-}
+ regs->ax = ret;
-#ifdef CONFIG_SYSCTL
-static ctl_table kernel_table2[] = {
- { .procname = "vsyscall64",
- .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec },
- {}
-};
+ /* Emulate a ret instruction. */
+ regs->ip = caller;
+ regs->sp += 8;
-static ctl_table kernel_root_table2[] = {
- { .procname = "kernel", .mode = 0555,
- .child = kernel_table2 },
- {}
-};
-#endif
+ local_irq_disable();
+ return;
-/* Assume __initcall executes before all user space. Hopefully kmod
- doesn't violate that. We'll find out if it does. */
+sigsegv:
+ regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */
+ force_sig(SIGSEGV, current);
+ local_irq_disable();
+}
+
+/*
+ * Assume __initcall executes before all user space. Hopefully kmod
+ * doesn't violate that. We'll find out if it does.
+ */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
unsigned long d;
@@ -261,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
write_rdtscp_aux((node << 12) | cpu);
- /* Store cpu number in limit so that it can be loaded quickly
- in user space in vgetcpu.
- 12 bits for the CPU and 8 bits for the node. */
+ /*
+ * Store cpu number in limit so that it can be loaded quickly
+ * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
+ */
d = 0x0f40000000000ULL;
d |= cpu;
d |= (node & 0xf) << 12;
d |= (node >> 4) << 48;
+
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
@@ -281,8 +247,10 @@ static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
long cpu = (long)arg;
+
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
+
return NOTIFY_DONE;
}
@@ -290,25 +258,23 @@ void __init map_vsyscall(void)
{
extern char __vsyscall_0;
unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+ extern char __vvar_page;
+ unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+ __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
}
static int __init vsyscall_init(void)
{
- BUG_ON(((unsigned long) &vgettimeofday !=
- VSYSCALL_ADDR(__NR_vgettimeofday)));
- BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
- BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
- BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-#ifdef CONFIG_SYSCTL
- register_sysctl_table(kernel_root_table2);
-#endif
+ BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+
on_each_cpu(cpu_vsyscall_init, NULL, 1);
/* notifier priority > KVM */
hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
return 0;
}
-
__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 00000000000..ffa845eae5c
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,27 @@
+/*
+ * vsyscall_emu_64.S: Vsyscall emulation page
+ *
+ * Copyright (c) 2011 Andy Lutomirski
+ *
+ * Subject to the GNU General Public License, version 2
+ */
+
+#include <linux/linkage.h>
+#include <asm/irq_vectors.h>
+
+/* The unused parts of the page are filled with 0xcc by the linker script. */
+
+.section .vsyscall_0, "a"
+ENTRY(vsyscall_0)
+ int $VSYSCALL_EMU_VECTOR
+END(vsyscall_0)
+
+.section .vsyscall_1, "a"
+ENTRY(vsyscall_1)
+ int $VSYSCALL_EMU_VECTOR
+END(vsyscall_1)
+
+.section .vsyscall_2, "a"
+ENTRY(vsyscall_2)
+ int $VSYSCALL_EMU_VECTOR
+END(vsyscall_2)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e9128..6f164bd5e14 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { }
struct x86_init_ops x86_init __initdata = {
.resources = {
- .probe_roms = x86_init_noop,
+ .probe_roms = probe_roms,
.reserve_resources = reserve_standard_io_resources,
.memory_setup = default_machine_specific_memory_setup,
},
@@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
.banner = default_banner,
},
+ .mapping = {
+ .pagetable_reserve = native_pagetable_reserve,
+ },
+
.paging = {
.pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done,
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 50f63648ce1..988724b236b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
+ select TASK_DELAY_ACCT
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
@@ -76,6 +77,5 @@ config KVM_MMU_AUDIT
# the virtualization menu.
source drivers/vhost/Kconfig
source drivers/lguest/Kconfig
-source drivers/virtio/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0ad47b819a8..6f08bc940fa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -47,35 +47,42 @@
#define DstDI (5<<1) /* Destination is in ES:(E)DI */
#define DstMem64 (6<<1) /* 64bit memory operand */
#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
-#define DstMask (7<<1)
+#define DstDX (8<<1) /* Destination is in DX register */
+#define DstMask (0xf<<1)
/* Source operand type. */
-#define SrcNone (0<<4) /* No source operand. */
-#define SrcReg (1<<4) /* Register operand. */
-#define SrcMem (2<<4) /* Memory operand. */
-#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
-#define SrcImm (5<<4) /* Immediate operand. */
-#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
-#define SrcOne (7<<4) /* Implied '1' */
-#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
-#define SrcImmU (9<<4) /* Immediate operand, unsigned */
-#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
-#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
-#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
-#define SrcAcc (0xd<<4) /* Source Accumulator */
-#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */
-#define SrcMask (0xf<<4)
+#define SrcNone (0<<5) /* No source operand. */
+#define SrcReg (1<<5) /* Register operand. */
+#define SrcMem (2<<5) /* Memory operand. */
+#define SrcMem16 (3<<5) /* Memory operand (16-bit). */
+#define SrcMem32 (4<<5) /* Memory operand (32-bit). */
+#define SrcImm (5<<5) /* Immediate operand. */
+#define SrcImmByte (6<<5) /* 8-bit sign-extended immediate operand. */
+#define SrcOne (7<<5) /* Implied '1' */
+#define SrcImmUByte (8<<5) /* 8-bit unsigned immediate operand. */
+#define SrcImmU (9<<5) /* Immediate operand, unsigned */
+#define SrcSI (0xa<<5) /* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<5) /* Source is immediate far address */
+#define SrcMemFAddr (0xc<<5) /* Source is far address in memory */
+#define SrcAcc (0xd<<5) /* Source Accumulator */
+#define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */
+#define SrcDX (0xf<<5) /* Source is in DX register */
+#define SrcMask (0xf<<5)
/* Generic ModRM decode. */
-#define ModRM (1<<8)
+#define ModRM (1<<9)
/* Destination is only written; never read. */
-#define Mov (1<<9)
-#define BitOp (1<<10)
-#define MemAbs (1<<11) /* Memory operand is absolute displacement */
-#define String (1<<12) /* String instruction (rep capable) */
-#define Stack (1<<13) /* Stack instruction (push/pop) */
-#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
-#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
+#define Mov (1<<10)
+#define BitOp (1<<11)
+#define MemAbs (1<<12) /* Memory operand is absolute displacement */
+#define String (1<<13) /* String instruction (rep capable) */
+#define Stack (1<<14) /* Stack instruction (push/pop) */
+#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */
+#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
+#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
+#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
+#define Sse (1<<18) /* SSE Vector instruction */
/* Misc flags */
+#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
#define VendorSpecific (1<<22) /* Vendor specific instruction */
#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
@@ -102,11 +109,14 @@
struct opcode {
u32 flags;
+ u8 intercept;
union {
int (*execute)(struct x86_emulate_ctxt *ctxt);
struct opcode *group;
struct group_dual *gdual;
+ struct gprefix *gprefix;
} u;
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
};
struct group_dual {
@@ -114,6 +124,13 @@ struct group_dual {
struct opcode mod3[8];
};
+struct gprefix {
+ struct opcode pfx_no;
+ struct opcode pfx_66;
+ struct opcode pfx_f2;
+ struct opcode pfx_f3;
+};
+
/* EFLAGS bit definitions. */
#define EFLG_ID (1<<21)
#define EFLG_VIP (1<<20)
@@ -248,42 +265,42 @@ struct group_dual {
"w", "r", _LO32, "r", "", "r")
/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
- do { \
- unsigned long _tmp; \
- _type _clv = (_cl).val; \
- _type _srcv = (_src).val; \
- _type _dstv = (_dst).val; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "2") \
- _op _suffix " %4,%1 \n" \
- _POST_EFLAGS("0", "5", "2") \
- : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
- : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
- ); \
- \
- (_cl).val = (unsigned long) _clv; \
- (_src).val = (unsigned long) _srcv; \
- (_dst).val = (unsigned long) _dstv; \
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
+ do { \
+ unsigned long _tmp; \
+ _type _clv = (_cl).val; \
+ _type _srcv = (_src).val; \
+ _type _dstv = (_dst).val; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "5", "2") \
+ _op _suffix " %4,%1 \n" \
+ _POST_EFLAGS("0", "5", "2") \
+ : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
+ : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
+ ); \
+ \
+ (_cl).val = (unsigned long) _clv; \
+ (_src).val = (unsigned long) _srcv; \
+ (_dst).val = (unsigned long) _dstv; \
} while (0)
-#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
- do { \
- switch ((_dst).bytes) { \
- case 2: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "w", unsigned short); \
- break; \
- case 4: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "l", unsigned int); \
- break; \
- case 8: \
- ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "q", unsigned long)); \
- break; \
- } \
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 2: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "w", unsigned short); \
+ break; \
+ case 4: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "l", unsigned int); \
+ break; \
+ case 8: \
+ ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "q", unsigned long)); \
+ break; \
+ } \
} while (0)
#define __emulate_1op(_op, _dst, _eflags, _suffix) \
@@ -346,13 +363,25 @@ struct group_dual {
} while (0)
/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
- do { \
- switch((_src).bytes) { \
- case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
- case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \
- case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
- case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
+#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
+ do { \
+ switch((_src).bytes) { \
+ case 1: \
+ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
+ _eflags, "b"); \
+ break; \
+ case 2: \
+ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
+ _eflags, "w"); \
+ break; \
+ case 4: \
+ __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
+ _eflags, "l"); \
+ break; \
+ case 8: \
+ ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
+ _eflags, "q")); \
+ break; \
} \
} while (0)
@@ -378,93 +407,88 @@ struct group_dual {
} \
} while (0)
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip) \
-({ unsigned long _x; \
- rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
- if (rc != X86EMUL_CONTINUE) \
- goto done; \
- (_eip) += (_size); \
- (_type)_x; \
-})
-
-#define insn_fetch_arr(_arr, _size, _eip) \
-({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
- if (rc != X86EMUL_CONTINUE) \
- goto done; \
- (_eip) += (_size); \
-})
+static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
+ enum x86_intercept intercept,
+ enum x86_intercept_stage stage)
+{
+ struct x86_instruction_info info = {
+ .intercept = intercept,
+ .rep_prefix = ctxt->rep_prefix,
+ .modrm_mod = ctxt->modrm_mod,
+ .modrm_reg = ctxt->modrm_reg,
+ .modrm_rm = ctxt->modrm_rm,
+ .src_val = ctxt->src.val64,
+ .src_bytes = ctxt->src.bytes,
+ .dst_bytes = ctxt->dst.bytes,
+ .ad_bytes = ctxt->ad_bytes,
+ .next_rip = ctxt->eip,
+ };
+
+ return ctxt->ops->intercept(ctxt, &info, stage);
+}
-static inline unsigned long ad_mask(struct decode_cache *c)
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
{
- return (1UL << (c->ad_bytes << 3)) - 1;
+ return (1UL << (ctxt->ad_bytes << 3)) - 1;
}
/* Access/update address held in a register, based on addressing mode. */
static inline unsigned long
-address_mask(struct decode_cache *c, unsigned long reg)
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
{
- if (c->ad_bytes == sizeof(unsigned long))
+ if (ctxt->ad_bytes == sizeof(unsigned long))
return reg;
else
- return reg & ad_mask(c);
+ return reg & ad_mask(ctxt);
}
static inline unsigned long
-register_address(struct decode_cache *c, unsigned long reg)
+register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
{
- return address_mask(c, reg);
+ return address_mask(ctxt, reg);
}
static inline void
-register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
{
- if (c->ad_bytes == sizeof(unsigned long))
+ if (ctxt->ad_bytes == sizeof(unsigned long))
*reg += inc;
else
- *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+ *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
}
-static inline void jmp_rel(struct decode_cache *c, int rel)
+static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
{
- register_address_increment(c, &c->eip, rel);
+ register_address_increment(ctxt, &ctxt->_eip, rel);
}
-static void set_seg_override(struct decode_cache *c, int seg)
+static u32 desc_limit_scaled(struct desc_struct *desc)
{
- c->has_seg_override = true;
- c->seg_override = seg;
+ u32 limit = get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
}
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
{
- if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
- return 0;
-
- return ops->get_cached_segment_base(seg, ctxt->vcpu);
+ ctxt->has_seg_override = true;
+ ctxt->seg_override = seg;
}
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct decode_cache *c)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
{
- if (!c->has_seg_override)
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
return 0;
- return c->seg_override;
+ return ctxt->ops->get_cached_segment_base(ctxt, seg);
}
-static ulong linear(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- ulong la;
+ if (!ctxt->has_seg_override)
+ return 0;
- la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
- if (c->ad_bytes != 8)
- la &= (u32)-1;
- return la;
+ return ctxt->seg_override;
}
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -476,11 +500,21 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
return X86EMUL_PROPAGATE_FAULT;
}
+static int emulate_db(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DB_VECTOR, 0, false);
+}
+
static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
{
return emulate_exception(ctxt, GP_VECTOR, err, true);
}
+static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, SS_VECTOR, err, true);
+}
+
static int emulate_ud(struct x86_emulate_ctxt *ctxt)
{
return emulate_exception(ctxt, UD_VECTOR, 0, false);
@@ -496,19 +530,144 @@ static int emulate_de(struct x86_emulate_ctxt *ctxt)
return emulate_exception(ctxt, DE_VECTOR, 0, false);
}
-static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
+static int emulate_nm(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, NM_VECTOR, 0, false);
+}
+
+static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
+{
+ u16 selector;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
+ return selector;
+}
+
+static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
+ unsigned seg)
+{
+ u16 dummy;
+ u32 base3;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
+}
+
+static int __linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned size, bool write, bool fetch,
+ ulong *linear)
+{
+ struct desc_struct desc;
+ bool usable;
+ ulong la;
+ u32 lim;
+ u16 sel;
+ unsigned cpl, rpl;
+
+ la = seg_base(ctxt, addr.seg) + addr.ea;
+ switch (ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ break;
+ case X86EMUL_MODE_PROT64:
+ if (((signed long)la << 16) >> 16 != la)
+ return emulate_gp(ctxt, 0);
+ break;
+ default:
+ usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
+ addr.seg);
+ if (!usable)
+ goto bad;
+ /* code segment or read-only data segment */
+ if (((desc.type & 8) || !(desc.type & 2)) && write)
+ goto bad;
+ /* unreadable code segment */
+ if (!fetch && (desc.type & 8) && !(desc.type & 2))
+ goto bad;
+ lim = desc_limit_scaled(&desc);
+ if ((desc.type & 8) || !(desc.type & 4)) {
+ /* expand-up segment */
+ if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
+ goto bad;
+ } else {
+ /* exapand-down segment */
+ if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
+ goto bad;
+ lim = desc.d ? 0xffffffff : 0xffff;
+ if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
+ goto bad;
+ }
+ cpl = ctxt->ops->cpl(ctxt);
+ rpl = sel & 3;
+ cpl = max(cpl, rpl);
+ if (!(desc.type & 8)) {
+ /* data segment */
+ if (cpl > desc.dpl)
+ goto bad;
+ } else if ((desc.type & 8) && !(desc.type & 4)) {
+ /* nonconforming code segment */
+ if (cpl != desc.dpl)
+ goto bad;
+ } else if ((desc.type & 8) && (desc.type & 4)) {
+ /* conforming code segment */
+ if (cpl < desc.dpl)
+ goto bad;
+ }
+ break;
+ }
+ if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
+ la &= (u32)-1;
+ *linear = la;
+ return X86EMUL_CONTINUE;
+bad:
+ if (addr.seg == VCPU_SREG_SS)
+ return emulate_ss(ctxt, addr.seg);
+ else
+ return emulate_gp(ctxt, addr.seg);
+}
+
+static int linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned size, bool write,
+ ulong *linear)
+{
+ return __linearize(ctxt, addr, size, write, false, linear);
+}
+
+
+static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
+}
+
+static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
unsigned long eip, u8 *dest)
{
- struct fetch_cache *fc = &ctxt->decode.fetch;
+ struct fetch_cache *fc = &ctxt->fetch;
int rc;
int size, cur_size;
if (eip == fc->end) {
+ unsigned long linear;
+ struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip};
cur_size = fc->end - fc->start;
size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
- rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
- size, ctxt->vcpu, &ctxt->exception);
+ rc = __linearize(ctxt, addr, size, false, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
+ size, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
fc->end += size;
@@ -518,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
}
static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
unsigned long eip, void *dest, unsigned size)
{
int rc;
@@ -527,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
if (eip + size - ctxt->eip > 15)
return X86EMUL_UNHANDLEABLE;
while (size--) {
- rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+ rc = do_insn_fetch_byte(ctxt, eip++, dest++);
if (rc != X86EMUL_CONTINUE)
return rc;
}
return X86EMUL_CONTINUE;
}
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip) \
+({ unsigned long _x; \
+ rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ (_eip) += (_size); \
+ (_type)_x; \
+})
+
+#define insn_fetch_arr(_arr, _size, _eip) \
+({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ (_eip) += (_size); \
+})
+
/*
* Given the 'reg' portion of a ModRM byte, and a register block, return a
* pointer into the block that addresses the relevant register.
@@ -551,7 +726,6 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
}
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct segmented_address addr,
u16 *size, unsigned long *address, int op_bytes)
{
@@ -560,13 +734,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
- rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
- ctxt->vcpu, &ctxt->exception);
+ rc = segmented_read_std(ctxt, addr, size, 2);
if (rc != X86EMUL_CONTINUE)
return rc;
addr.ea += 2;
- rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
- ctxt->vcpu, &ctxt->exception);
+ rc = segmented_read_std(ctxt, addr, address, op_bytes);
return rc;
}
@@ -623,80 +795,149 @@ static void fetch_register_operand(struct operand *op)
}
}
-static void decode_register_operand(struct operand *op,
- struct decode_cache *c,
+static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
+{
+ ctxt->ops->get_fpu(ctxt);
+ switch (reg) {
+ case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+ ctxt->ops->put_fpu(ctxt);
+}
+
+static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
+ int reg)
+{
+ ctxt->ops->get_fpu(ctxt);
+ switch (reg) {
+ case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+ ctxt->ops->put_fpu(ctxt);
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op,
int inhibit_bytereg)
{
- unsigned reg = c->modrm_reg;
- int highbyte_regs = c->rex_prefix == 0;
+ unsigned reg = ctxt->modrm_reg;
+ int highbyte_regs = ctxt->rex_prefix == 0;
+
+ if (!(ctxt->d & ModRM))
+ reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
+
+ if (ctxt->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = reg;
+ read_sse_reg(ctxt, &op->vec_val, reg);
+ return;
+ }
- if (!(c->d & ModRM))
- reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
op->type = OP_REG;
- if ((c->d & ByteOp) && !inhibit_bytereg) {
- op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
+ if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+ op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
op->bytes = 1;
} else {
- op->addr.reg = decode_register(reg, c->regs, 0);
- op->bytes = c->op_bytes;
+ op->addr.reg = decode_register(reg, ctxt->regs, 0);
+ op->bytes = ctxt->op_bytes;
}
fetch_register_operand(op);
op->orig_val = op->val;
}
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct operand *op)
{
- struct decode_cache *c = &ctxt->decode;
u8 sib;
int index_reg = 0, base_reg = 0, scale;
int rc = X86EMUL_CONTINUE;
ulong modrm_ea = 0;
- if (c->rex_prefix) {
- c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
- index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
- c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
+ if (ctxt->rex_prefix) {
+ ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */
+ index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
+ ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
}
- c->modrm = insn_fetch(u8, 1, c->eip);
- c->modrm_mod |= (c->modrm & 0xc0) >> 6;
- c->modrm_reg |= (c->modrm & 0x38) >> 3;
- c->modrm_rm |= (c->modrm & 0x07);
- c->modrm_seg = VCPU_SREG_DS;
+ ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+ ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
+ ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+ ctxt->modrm_rm |= (ctxt->modrm & 0x07);
+ ctxt->modrm_seg = VCPU_SREG_DS;
- if (c->modrm_mod == 3) {
+ if (ctxt->modrm_mod == 3) {
op->type = OP_REG;
- op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- op->addr.reg = decode_register(c->modrm_rm,
- c->regs, c->d & ByteOp);
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = decode_register(ctxt->modrm_rm,
+ ctxt->regs, ctxt->d & ByteOp);
+ if (ctxt->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = ctxt->modrm_rm;
+ read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
+ return rc;
+ }
fetch_register_operand(op);
return rc;
}
op->type = OP_MEM;
- if (c->ad_bytes == 2) {
- unsigned bx = c->regs[VCPU_REGS_RBX];
- unsigned bp = c->regs[VCPU_REGS_RBP];
- unsigned si = c->regs[VCPU_REGS_RSI];
- unsigned di = c->regs[VCPU_REGS_RDI];
+ if (ctxt->ad_bytes == 2) {
+ unsigned bx = ctxt->regs[VCPU_REGS_RBX];
+ unsigned bp = ctxt->regs[VCPU_REGS_RBP];
+ unsigned si = ctxt->regs[VCPU_REGS_RSI];
+ unsigned di = ctxt->regs[VCPU_REGS_RDI];
/* 16-bit ModR/M decode. */
- switch (c->modrm_mod) {
+ switch (ctxt->modrm_mod) {
case 0:
- if (c->modrm_rm == 6)
- modrm_ea += insn_fetch(u16, 2, c->eip);
+ if (ctxt->modrm_rm == 6)
+ modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
break;
case 1:
- modrm_ea += insn_fetch(s8, 1, c->eip);
+ modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
break;
case 2:
- modrm_ea += insn_fetch(u16, 2, c->eip);
+ modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
break;
}
- switch (c->modrm_rm) {
+ switch (ctxt->modrm_rm) {
case 0:
modrm_ea += bx + si;
break;
@@ -716,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
modrm_ea += di;
break;
case 6:
- if (c->modrm_mod != 0)
+ if (ctxt->modrm_mod != 0)
modrm_ea += bp;
break;
case 7:
modrm_ea += bx;
break;
}
- if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
- (c->modrm_rm == 6 && c->modrm_mod != 0))
- c->modrm_seg = VCPU_SREG_SS;
+ if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+ (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+ ctxt->modrm_seg = VCPU_SREG_SS;
modrm_ea = (u16)modrm_ea;
} else {
/* 32/64-bit ModR/M decode. */
- if ((c->modrm_rm & 7) == 4) {
- sib = insn_fetch(u8, 1, c->eip);
+ if ((ctxt->modrm_rm & 7) == 4) {
+ sib = insn_fetch(u8, 1, ctxt->_eip);
index_reg |= (sib >> 3) & 7;
base_reg |= sib & 7;
scale = sib >> 6;
- if ((base_reg & 7) == 5 && c->modrm_mod == 0)
- modrm_ea += insn_fetch(s32, 4, c->eip);
+ if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+ modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
else
- modrm_ea += c->regs[base_reg];
+ modrm_ea += ctxt->regs[base_reg];
if (index_reg != 4)
- modrm_ea += c->regs[index_reg] << scale;
- } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
+ modrm_ea += ctxt->regs[index_reg] << scale;
+ } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
if (ctxt->mode == X86EMUL_MODE_PROT64)
- c->rip_relative = 1;
+ ctxt->rip_relative = 1;
} else
- modrm_ea += c->regs[c->modrm_rm];
- switch (c->modrm_mod) {
+ modrm_ea += ctxt->regs[ctxt->modrm_rm];
+ switch (ctxt->modrm_mod) {
case 0:
- if (c->modrm_rm == 5)
- modrm_ea += insn_fetch(s32, 4, c->eip);
+ if (ctxt->modrm_rm == 5)
+ modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
break;
case 1:
- modrm_ea += insn_fetch(s8, 1, c->eip);
+ modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
break;
case 2:
- modrm_ea += insn_fetch(s32, 4, c->eip);
+ modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
break;
}
}
@@ -765,53 +1006,50 @@ done:
}
static int decode_abs(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct operand *op)
{
- struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
op->type = OP_MEM;
- switch (c->ad_bytes) {
+ switch (ctxt->ad_bytes) {
case 2:
- op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
+ op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
break;
case 4:
- op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
+ op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
break;
case 8:
- op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
+ op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
break;
}
done:
return rc;
}
-static void fetch_bit_operand(struct decode_cache *c)
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
{
long sv = 0, mask;
- if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
- mask = ~(c->dst.bytes * 8 - 1);
+ if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+ mask = ~(ctxt->dst.bytes * 8 - 1);
- if (c->src.bytes == 2)
- sv = (s16)c->src.val & (s16)mask;
- else if (c->src.bytes == 4)
- sv = (s32)c->src.val & (s32)mask;
+ if (ctxt->src.bytes == 2)
+ sv = (s16)ctxt->src.val & (s16)mask;
+ else if (ctxt->src.bytes == 4)
+ sv = (s32)ctxt->src.val & (s32)mask;
- c->dst.addr.mem.ea += (sv >> 3);
+ ctxt->dst.addr.mem.ea += (sv >> 3);
}
/* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
+ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
}
static int read_emulated(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
unsigned long addr, void *dest, unsigned size)
{
int rc;
- struct read_cache *mc = &ctxt->decode.mem_read;
+ struct read_cache *mc = &ctxt->mem_read;
while (size) {
int n = min(size, 8u);
@@ -819,8 +1057,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
if (mc->pos < mc->end)
goto read_cached;
- rc = ops->read_emulated(addr, mc->data + mc->end, n,
- &ctxt->exception, ctxt->vcpu);
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
+ &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
mc->end += n;
@@ -834,27 +1072,69 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
+static int segmented_read(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return read_emulated(ctxt, linear, data, size);
+}
+
+static int segmented_write(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_emulated(ctxt, linear, data, size,
+ &ctxt->exception);
+}
+
+static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *orig_data, const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
+ size, &ctxt->exception);
+}
+
static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
unsigned int size, unsigned short port,
void *dest)
{
- struct read_cache *rc = &ctxt->decode.io_read;
+ struct read_cache *rc = &ctxt->io_read;
if (rc->pos == rc->end) { /* refill pio read ahead */
- struct decode_cache *c = &ctxt->decode;
unsigned int in_page, n;
- unsigned int count = c->rep_prefix ?
- address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+ unsigned int count = ctxt->rep_prefix ?
+ address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
in_page = (ctxt->eflags & EFLG_DF) ?
- offset_in_page(c->regs[VCPU_REGS_RDI]) :
- PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+ offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
+ PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
count);
if (n == 0)
n = 1;
rc->pos = rc->end = 0;
- if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+ if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
return 0;
rc->end = n * size;
}
@@ -864,76 +1144,63 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
return 1;
}
-static u32 desc_limit_scaled(struct desc_struct *desc)
-{
- u32 limit = get_desc_limit(desc);
-
- return desc->g ? (limit << 12) | 0xfff : limit;
-}
-
static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 selector, struct desc_ptr *dt)
{
+ struct x86_emulate_ops *ops = ctxt->ops;
+
if (selector & 1 << 2) {
struct desc_struct desc;
+ u16 sel;
+
memset (dt, 0, sizeof *dt);
- if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
- ctxt->vcpu))
+ if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
return;
dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
dt->address = get_desc_base(&desc);
} else
- ops->get_gdt(dt, ctxt->vcpu);
+ ops->get_gdt(ctxt, dt);
}
/* allowed just for 8 bytes segments */
static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 selector, struct desc_struct *desc)
{
struct desc_ptr dt;
u16 index = selector >> 3;
- int ret;
ulong addr;
- get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+ get_descriptor_table_ptr(ctxt, selector, &dt);
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
- addr = dt.address + index * 8;
- ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
- &ctxt->exception);
- return ret;
+ addr = dt.address + index * 8;
+ return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+ &ctxt->exception);
}
/* allowed just for 8 bytes segments */
static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 selector, struct desc_struct *desc)
{
struct desc_ptr dt;
u16 index = selector >> 3;
ulong addr;
- int ret;
- get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+ get_descriptor_table_ptr(ctxt, selector, &dt);
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
- ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
- &ctxt->exception);
-
- return ret;
+ return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
+ &ctxt->exception);
}
/* Does not support long mode */
static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 selector, int seg)
{
struct desc_struct seg_desc;
@@ -968,7 +1235,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (null_selector) /* for NULL selector skip all following checks */
goto load;
- ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ ret = read_segment_descriptor(ctxt, selector, &seg_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -986,7 +1253,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
rpl = selector & 3;
dpl = seg_desc.dpl;
- cpl = ops->cpl(ctxt->vcpu);
+ cpl = ctxt->ops->cpl(ctxt);
switch (seg) {
case VCPU_SREG_SS:
@@ -1037,13 +1304,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (seg_desc.s) {
/* mark segment as accessed */
seg_desc.type |= 1;
- ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ ret = write_segment_descriptor(ctxt, selector, &seg_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
}
load:
- ops->set_segment_selector(selector, seg, ctxt->vcpu);
- ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
+ ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
return X86EMUL_CONTINUE;
exception:
emulate_exception(ctxt, err_vec, err_code, true);
@@ -1069,35 +1335,32 @@ static void write_register_operand(struct operand *op)
}
}
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int writeback(struct x86_emulate_ctxt *ctxt)
{
int rc;
- struct decode_cache *c = &ctxt->decode;
- switch (c->dst.type) {
+ switch (ctxt->dst.type) {
case OP_REG:
- write_register_operand(&c->dst);
+ write_register_operand(&ctxt->dst);
break;
case OP_MEM:
- if (c->lock_prefix)
- rc = ops->cmpxchg_emulated(
- linear(ctxt, c->dst.addr.mem),
- &c->dst.orig_val,
- &c->dst.val,
- c->dst.bytes,
- &ctxt->exception,
- ctxt->vcpu);
+ if (ctxt->lock_prefix)
+ rc = segmented_cmpxchg(ctxt,
+ ctxt->dst.addr.mem,
+ &ctxt->dst.orig_val,
+ &ctxt->dst.val,
+ ctxt->dst.bytes);
else
- rc = ops->write_emulated(
- linear(ctxt, c->dst.addr.mem),
- &c->dst.val,
- c->dst.bytes,
- &ctxt->exception,
- ctxt->vcpu);
+ rc = segmented_write(ctxt,
+ ctxt->dst.addr.mem,
+ &ctxt->dst.val,
+ ctxt->dst.bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
break;
+ case OP_XMM:
+ write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
+ break;
case OP_NONE:
/* no writeback */
break;
@@ -1107,47 +1370,49 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_push(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ struct segmented_address addr;
+
+ register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
+ addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
+ addr.seg = VCPU_SREG_SS;
- c->dst.type = OP_MEM;
- c->dst.bytes = c->op_bytes;
- c->dst.val = c->src.val;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
- c->dst.addr.mem.seg = VCPU_SREG_SS;
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
void *dest, int len)
{
- struct decode_cache *c = &ctxt->decode;
int rc;
struct segmented_address addr;
- addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
addr.seg = VCPU_SREG_SS;
- rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
+ rc = segmented_read(ctxt, addr, dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
+ register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
return rc;
}
+static int em_pop(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
static int emulate_popf(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- void *dest, int len)
+ void *dest, int len)
{
int rc;
unsigned long val, change_mask;
int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- int cpl = ops->cpl(ctxt->vcpu);
+ int cpl = ctxt->ops->cpl(ctxt);
- rc = emulate_pop(ctxt, ops, &val, len);
+ rc = emulate_pop(ctxt, &val, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1179,73 +1444,73 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static int em_popf(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = &ctxt->eflags;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
- c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
+static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+{
+ ctxt->src.val = get_segment_selector(ctxt, seg);
- emulate_push(ctxt, ops);
+ return em_push(ctxt);
}
-static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
{
- struct decode_cache *c = &ctxt->decode;
unsigned long selector;
int rc;
- rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
+ rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
+ rc = load_segment_descriptor(ctxt, (u16)selector, seg);
return rc;
}
-static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_pusha(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+ unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RAX;
while (reg <= VCPU_REGS_RDI) {
(reg == VCPU_REGS_RSP) ?
- (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
+ (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
- emulate_push(ctxt, ops);
-
- rc = writeback(ctxt, ops);
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
++reg;
}
- /* Disable writeback. */
- c->dst.type = OP_NONE;
-
return rc;
}
-static int emulate_popa(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_pushf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.val = (unsigned long)ctxt->eflags;
+ return em_push(ctxt);
+}
+
+static int em_popa(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RDI;
while (reg >= VCPU_REGS_RAX) {
if (reg == VCPU_REGS_RSP) {
- register_address_increment(c, &c->regs[VCPU_REGS_RSP],
- c->op_bytes);
+ register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
+ ctxt->op_bytes);
--reg;
}
- rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
+ rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
break;
--reg;
@@ -1253,10 +1518,9 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
return rc;
}
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int irq)
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
{
- struct decode_cache *c = &ctxt->decode;
+ struct x86_emulate_ops *ops = ctxt->ops;
int rc;
struct desc_ptr dt;
gva_t cs_addr;
@@ -1264,56 +1528,50 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
u16 cs, eip;
/* TODO: Add limit checks */
- c->src.val = ctxt->eflags;
- emulate_push(ctxt, ops);
- rc = writeback(ctxt, ops);
+ ctxt->src.val = ctxt->eflags;
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
- c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- emulate_push(ctxt, ops);
- rc = writeback(ctxt, ops);
+ ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
- c->src.val = c->eip;
- emulate_push(ctxt, ops);
- rc = writeback(ctxt, ops);
+ ctxt->src.val = ctxt->_eip;
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
- c->dst.type = OP_NONE;
-
- ops->get_idt(&dt, ctxt->vcpu);
+ ops->get_idt(ctxt, &dt);
eip_addr = dt.address + (irq << 2);
cs_addr = dt.address + (irq << 2) + 2;
- rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
+ rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
+ rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+ rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
if (rc != X86EMUL_CONTINUE)
return rc;
- c->eip = eip;
+ ctxt->_eip = eip;
return rc;
}
-static int emulate_int(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int irq)
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
{
switch(ctxt->mode) {
case X86EMUL_MODE_REAL:
- return emulate_int_real(ctxt, ops, irq);
+ return emulate_int_real(ctxt, irq);
case X86EMUL_MODE_VM86:
case X86EMUL_MODE_PROT16:
case X86EMUL_MODE_PROT32:
@@ -1324,10 +1582,8 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt,
}
}
-static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
unsigned long temp_eip = 0;
unsigned long temp_eflags = 0;
@@ -1339,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
/* TODO: Add stack limit check */
- rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
+ rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1347,27 +1603,27 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
if (temp_eip & ~0xffff)
return emulate_gp(ctxt, 0);
- rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
+ rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+ rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
if (rc != X86EMUL_CONTINUE)
return rc;
- c->eip = temp_eip;
+ ctxt->_eip = temp_eip;
- if (c->op_bytes == 4)
+ if (ctxt->op_bytes == 4)
ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
- else if (c->op_bytes == 2) {
+ else if (ctxt->op_bytes == 2) {
ctxt->eflags &= ~0xffff;
ctxt->eflags |= temp_eflags;
}
@@ -1378,12 +1634,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops* ops)
+static int em_iret(struct x86_emulate_ctxt *ctxt)
{
switch(ctxt->mode) {
case X86EMUL_MODE_REAL:
- return emulate_iret_real(ctxt, ops);
+ return emulate_iret_real(ctxt);
case X86EMUL_MODE_VM86:
case X86EMUL_MODE_PROT16:
case X86EMUL_MODE_PROT32:
@@ -1394,73 +1649,84 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
}
}
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ int rc;
+ unsigned short sel;
- return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = 0;
+ memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
+ return X86EMUL_CONTINUE;
}
-static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+static int em_grp1a(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- switch (c->modrm_reg) {
+ return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
+}
+
+static int em_grp2(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->modrm_reg) {
case 0: /* rol */
- emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 1: /* ror */
- emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 2: /* rcl */
- emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 3: /* rcr */
- emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 4: /* sal/shl */
case 6: /* sal/shl */
- emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 5: /* shr */
- emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 7: /* sar */
- emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
break;
}
+ return X86EMUL_CONTINUE;
}
-static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_grp3(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- unsigned long *rax = &c->regs[VCPU_REGS_RAX];
- unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+ unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
+ unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
u8 de = 0;
- switch (c->modrm_reg) {
+ switch (ctxt->modrm_reg) {
case 0 ... 1: /* test */
- emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 2: /* not */
- c->dst.val = ~c->dst.val;
+ ctxt->dst.val = ~ctxt->dst.val;
break;
case 3: /* neg */
- emulate_1op("neg", c->dst, ctxt->eflags);
+ emulate_1op("neg", ctxt->dst, ctxt->eflags);
break;
case 4: /* mul */
- emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+ emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
break;
case 5: /* imul */
- emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+ emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
break;
case 6: /* div */
- emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+ emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
ctxt->eflags, de);
break;
case 7: /* idiv */
- emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+ emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
ctxt->eflags, de);
break;
default:
@@ -1471,99 +1737,104 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_grp45(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
- switch (c->modrm_reg) {
+ switch (ctxt->modrm_reg) {
case 0: /* inc */
- emulate_1op("inc", c->dst, ctxt->eflags);
+ emulate_1op("inc", ctxt->dst, ctxt->eflags);
break;
case 1: /* dec */
- emulate_1op("dec", c->dst, ctxt->eflags);
+ emulate_1op("dec", ctxt->dst, ctxt->eflags);
break;
case 2: /* call near abs */ {
long int old_eip;
- old_eip = c->eip;
- c->eip = c->src.val;
- c->src.val = old_eip;
- emulate_push(ctxt, ops);
+ old_eip = ctxt->_eip;
+ ctxt->_eip = ctxt->src.val;
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
break;
}
case 4: /* jmp abs */
- c->eip = c->src.val;
+ ctxt->_eip = ctxt->src.val;
+ break;
+ case 5: /* jmp far */
+ rc = em_jmp_far(ctxt);
break;
case 6: /* push */
- emulate_push(ctxt, ops);
+ rc = em_push(ctxt);
break;
}
- return X86EMUL_CONTINUE;
+ return rc;
}
-static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_grp9(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- u64 old = c->dst.orig_val64;
+ u64 old = ctxt->dst.orig_val64;
- if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
- ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
- c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
- c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+ if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
+ ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
+ ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+ ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
ctxt->eflags &= ~EFLG_ZF;
} else {
- c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
- (u32) c->regs[VCPU_REGS_RBX];
+ ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
+ (u32) ctxt->regs[VCPU_REGS_RBX];
ctxt->eflags |= EFLG_ZF;
}
return X86EMUL_CONTINUE;
}
-static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_ret(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = &ctxt->_eip;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ return em_pop(ctxt);
+}
+
+static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
int rc;
unsigned long cs;
- rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
+ rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- if (c->op_bytes == 4)
- c->eip = (u32)c->eip;
- rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+ if (ctxt->op_bytes == 4)
+ ctxt->_eip = (u32)ctxt->_eip;
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+ rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
return rc;
}
-static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
{
- struct decode_cache *c = &ctxt->decode;
unsigned short sel;
int rc;
- memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- rc = load_segment_descriptor(ctxt, ops, sel, seg);
+ rc = load_segment_descriptor(ctxt, sel, seg);
if (rc != X86EMUL_CONTINUE)
return rc;
- c->dst.val = c->src.val;
+ ctxt->dst.val = ctxt->src.val;
return rc;
}
-static inline void
+static void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, struct desc_struct *cs,
- struct desc_struct *ss)
+ struct desc_struct *cs, struct desc_struct *ss)
{
+ u16 selector;
+
memset(cs, 0, sizeof(struct desc_struct));
- ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
+ ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
memset(ss, 0, sizeof(struct desc_struct));
cs->l = 0; /* will be adjusted later */
@@ -1586,52 +1857,51 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
ss->p = 1;
}
-static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_syscall(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct cs, ss;
u64 msr_data;
u16 cs_sel, ss_sel;
+ u64 efer = 0;
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_ud(ctxt);
- setup_syscalls_segments(ctxt, ops, &cs, &ss);
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ setup_syscalls_segments(ctxt, &cs, &ss);
- ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
msr_data >>= 32;
cs_sel = (u16)(msr_data & 0xfffc);
ss_sel = (u16)(msr_data + 8);
- if (is_long_mode(ctxt->vcpu)) {
+ if (efer & EFER_LMA) {
cs.d = 0;
cs.l = 1;
}
- ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- c->regs[VCPU_REGS_RCX] = c->eip;
- if (is_long_mode(ctxt->vcpu)) {
+ ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
+ if (efer & EFER_LMA) {
#ifdef CONFIG_X86_64
- c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+ ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
- ops->get_msr(ctxt->vcpu,
+ ops->get_msr(ctxt,
ctxt->mode == X86EMUL_MODE_PROT64 ?
MSR_LSTAR : MSR_CSTAR, &msr_data);
- c->eip = msr_data;
+ ctxt->_eip = msr_data;
- ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
ctxt->eflags &= ~(msr_data | EFLG_RF);
#endif
} else {
/* legacy mode */
- ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
- c->eip = (u32)msr_data;
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ ctxt->_eip = (u32)msr_data;
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
}
@@ -1639,14 +1909,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
return X86EMUL_CONTINUE;
}
-static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_sysenter(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct cs, ss;
u64 msr_data;
u16 cs_sel, ss_sel;
+ u64 efer = 0;
+ ops->get_msr(ctxt, MSR_EFER, &efer);
/* inject #GP if in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL)
return emulate_gp(ctxt, 0);
@@ -1657,9 +1928,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
if (ctxt->mode == X86EMUL_MODE_PROT64)
return emulate_ud(ctxt);
- setup_syscalls_segments(ctxt, ops, &cs, &ss);
+ setup_syscalls_segments(ctxt, &cs, &ss);
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
if ((msr_data & 0xfffc) == 0x0)
@@ -1676,50 +1947,46 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs_sel &= ~SELECTOR_RPL_MASK;
ss_sel = cs_sel + 8;
ss_sel &= ~SELECTOR_RPL_MASK;
- if (ctxt->mode == X86EMUL_MODE_PROT64
- || is_long_mode(ctxt->vcpu)) {
+ if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) {
cs.d = 0;
cs.l = 1;
}
- ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
- c->eip = msr_data;
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ctxt->_eip = msr_data;
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
- c->regs[VCPU_REGS_RSP] = msr_data;
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
+ ctxt->regs[VCPU_REGS_RSP] = msr_data;
return X86EMUL_CONTINUE;
}
-static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_sysexit(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct cs, ss;
u64 msr_data;
int usermode;
- u16 cs_sel, ss_sel;
+ u16 cs_sel = 0, ss_sel = 0;
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, ops, &cs, &ss);
+ setup_syscalls_segments(ctxt, &cs, &ss);
- if ((c->rex_prefix & 0x8) != 0x0)
+ if ((ctxt->rex_prefix & 0x8) != 0x0)
usermode = X86EMUL_MODE_PROT64;
else
usermode = X86EMUL_MODE_PROT32;
cs.dpl = 3;
ss.dpl = 3;
- ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
switch (usermode) {
case X86EMUL_MODE_PROT32:
cs_sel = (u16)(msr_data + 16);
@@ -1739,19 +2006,16 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs_sel |= SELECTOR_RPL_MASK;
ss_sel |= SELECTOR_RPL_MASK;
- ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- c->eip = c->regs[VCPU_REGS_RDX];
- c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
+ ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
+ ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
return X86EMUL_CONTINUE;
}
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
{
int iopl;
if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1759,21 +2023,21 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
if (ctxt->mode == X86EMUL_MODE_VM86)
return true;
iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- return ops->cpl(ctxt->vcpu) > iopl;
+ return ctxt->ops->cpl(ctxt) > iopl;
}
static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 port, u16 len)
{
+ struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct tr_seg;
u32 base3;
int r;
- u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
+ u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
unsigned mask = (1 << len) - 1;
unsigned long base;
- ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
+ ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
if (!tr_seg.p)
return false;
if (desc_limit_scaled(&tr_seg) < 103)
@@ -1782,13 +2046,12 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
#ifdef CONFIG_X86_64
base |= ((u64)base3) << 32;
#endif
- r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
+ r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL);
if (r != X86EMUL_CONTINUE)
return false;
if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
return false;
- r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
- NULL);
+ r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL);
if (r != X86EMUL_CONTINUE)
return false;
if ((perm >> bit_idx) & mask)
@@ -1797,14 +2060,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
}
static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 port, u16 len)
{
if (ctxt->perm_ok)
return true;
- if (emulator_bad_iopl(ctxt, ops))
- if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, port, len))
return false;
ctxt->perm_ok = true;
@@ -1813,74 +2075,69 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
}
static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct tss_segment_16 *tss)
{
- struct decode_cache *c = &ctxt->decode;
-
- tss->ip = c->eip;
+ tss->ip = ctxt->_eip;
tss->flag = ctxt->eflags;
- tss->ax = c->regs[VCPU_REGS_RAX];
- tss->cx = c->regs[VCPU_REGS_RCX];
- tss->dx = c->regs[VCPU_REGS_RDX];
- tss->bx = c->regs[VCPU_REGS_RBX];
- tss->sp = c->regs[VCPU_REGS_RSP];
- tss->bp = c->regs[VCPU_REGS_RBP];
- tss->si = c->regs[VCPU_REGS_RSI];
- tss->di = c->regs[VCPU_REGS_RDI];
-
- tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
- tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
- tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
- tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+ tss->ax = ctxt->regs[VCPU_REGS_RAX];
+ tss->cx = ctxt->regs[VCPU_REGS_RCX];
+ tss->dx = ctxt->regs[VCPU_REGS_RDX];
+ tss->bx = ctxt->regs[VCPU_REGS_RBX];
+ tss->sp = ctxt->regs[VCPU_REGS_RSP];
+ tss->bp = ctxt->regs[VCPU_REGS_RBP];
+ tss->si = ctxt->regs[VCPU_REGS_RSI];
+ tss->di = ctxt->regs[VCPU_REGS_RDI];
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
}
static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct tss_segment_16 *tss)
{
- struct decode_cache *c = &ctxt->decode;
int ret;
- c->eip = tss->ip;
+ ctxt->_eip = tss->ip;
ctxt->eflags = tss->flag | 2;
- c->regs[VCPU_REGS_RAX] = tss->ax;
- c->regs[VCPU_REGS_RCX] = tss->cx;
- c->regs[VCPU_REGS_RDX] = tss->dx;
- c->regs[VCPU_REGS_RBX] = tss->bx;
- c->regs[VCPU_REGS_RSP] = tss->sp;
- c->regs[VCPU_REGS_RBP] = tss->bp;
- c->regs[VCPU_REGS_RSI] = tss->si;
- c->regs[VCPU_REGS_RDI] = tss->di;
+ ctxt->regs[VCPU_REGS_RAX] = tss->ax;
+ ctxt->regs[VCPU_REGS_RCX] = tss->cx;
+ ctxt->regs[VCPU_REGS_RDX] = tss->dx;
+ ctxt->regs[VCPU_REGS_RBX] = tss->bx;
+ ctxt->regs[VCPU_REGS_RSP] = tss->sp;
+ ctxt->regs[VCPU_REGS_RBP] = tss->bp;
+ ctxt->regs[VCPU_REGS_RSI] = tss->si;
+ ctxt->regs[VCPU_REGS_RDI] = tss->di;
/*
* SDM says that segment selectors are loaded before segment
* descriptors
*/
- ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
- ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
- ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+ set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
/*
* Now load segment descriptors. If fault happenes at this stage
* it is handled in a context of new task
*/
- ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+ ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -1888,29 +2145,29 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
}
static int task_switch_16(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 tss_selector, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
+ struct x86_emulate_ops *ops = ctxt->ops;
struct tss_segment_16 tss_seg;
int ret;
u32 new_tss_base = get_desc_base(new_desc);
- ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
- save_state_to_tss16(ctxt, ops, &tss_seg);
+ save_state_to_tss16(ctxt, &tss_seg);
- ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
- ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
@@ -1919,100 +2176,95 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
- ret = ops->write_std(new_tss_base,
+ ret = ops->write_std(ctxt, new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &ctxt->exception);
+ &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
}
- return load_state_from_tss16(ctxt, ops, &tss_seg);
+ return load_state_from_tss16(ctxt, &tss_seg);
}
static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct tss_segment_32 *tss)
{
- struct decode_cache *c = &ctxt->decode;
-
- tss->cr3 = ops->get_cr(3, ctxt->vcpu);
- tss->eip = c->eip;
+ tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
+ tss->eip = ctxt->_eip;
tss->eflags = ctxt->eflags;
- tss->eax = c->regs[VCPU_REGS_RAX];
- tss->ecx = c->regs[VCPU_REGS_RCX];
- tss->edx = c->regs[VCPU_REGS_RDX];
- tss->ebx = c->regs[VCPU_REGS_RBX];
- tss->esp = c->regs[VCPU_REGS_RSP];
- tss->ebp = c->regs[VCPU_REGS_RBP];
- tss->esi = c->regs[VCPU_REGS_RSI];
- tss->edi = c->regs[VCPU_REGS_RDI];
-
- tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
- tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
- tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
- tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
- tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
- tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+ tss->eax = ctxt->regs[VCPU_REGS_RAX];
+ tss->ecx = ctxt->regs[VCPU_REGS_RCX];
+ tss->edx = ctxt->regs[VCPU_REGS_RDX];
+ tss->ebx = ctxt->regs[VCPU_REGS_RBX];
+ tss->esp = ctxt->regs[VCPU_REGS_RSP];
+ tss->ebp = ctxt->regs[VCPU_REGS_RBP];
+ tss->esi = ctxt->regs[VCPU_REGS_RSI];
+ tss->edi = ctxt->regs[VCPU_REGS_RDI];
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
+ tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
}
static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct tss_segment_32 *tss)
{
- struct decode_cache *c = &ctxt->decode;
int ret;
- if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
+ if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
return emulate_gp(ctxt, 0);
- c->eip = tss->eip;
+ ctxt->_eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
- c->regs[VCPU_REGS_RAX] = tss->eax;
- c->regs[VCPU_REGS_RCX] = tss->ecx;
- c->regs[VCPU_REGS_RDX] = tss->edx;
- c->regs[VCPU_REGS_RBX] = tss->ebx;
- c->regs[VCPU_REGS_RSP] = tss->esp;
- c->regs[VCPU_REGS_RBP] = tss->ebp;
- c->regs[VCPU_REGS_RSI] = tss->esi;
- c->regs[VCPU_REGS_RDI] = tss->edi;
+ ctxt->regs[VCPU_REGS_RAX] = tss->eax;
+ ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
+ ctxt->regs[VCPU_REGS_RDX] = tss->edx;
+ ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
+ ctxt->regs[VCPU_REGS_RSP] = tss->esp;
+ ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
+ ctxt->regs[VCPU_REGS_RSI] = tss->esi;
+ ctxt->regs[VCPU_REGS_RDI] = tss->edi;
/*
* SDM says that segment selectors are loaded before segment
* descriptors
*/
- ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
- ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
- ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
- ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
- ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
+ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+ set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
+ set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
/*
* Now load segment descriptors. If fault happenes at this stage
* it is handled in a context of new task
*/
- ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+ ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+ ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+ ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2020,29 +2272,29 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
}
static int task_switch_32(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 tss_selector, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
+ struct x86_emulate_ops *ops = ctxt->ops;
struct tss_segment_32 tss_seg;
int ret;
u32 new_tss_base = get_desc_base(new_desc);
- ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
- save_state_to_tss32(ctxt, ops, &tss_seg);
+ save_state_to_tss32(ctxt, &tss_seg);
- ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
- ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+ ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
@@ -2051,36 +2303,36 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
- ret = ops->write_std(new_tss_base,
+ ret = ops->write_std(ctxt, new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &ctxt->exception);
+ &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
return ret;
}
- return load_state_from_tss32(ctxt, ops, &tss_seg);
+ return load_state_from_tss32(ctxt, &tss_seg);
}
static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 tss_selector, int reason,
bool has_error_code, u32 error_code)
{
+ struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct curr_tss_desc, next_tss_desc;
int ret;
- u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
+ u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
ulong old_tss_base =
- ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
+ ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
u32 desc_limit;
/* FIXME: old_tss_base == ~0 ? */
- ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+ ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+ ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2088,7 +2340,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
if ((tss_selector & 3) > next_tss_desc.dpl ||
- ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
+ ops->cpl(ctxt) > next_tss_desc.dpl)
return emulate_gp(ctxt, 0);
}
@@ -2102,8 +2354,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
- write_segment_descriptor(ctxt, ops, old_tss_sel,
- &curr_tss_desc);
+ write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
}
if (reason == TASK_SWITCH_IRET)
@@ -2115,10 +2366,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
old_tss_sel = 0xffff;
if (next_tss_desc.type & 8)
- ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+ ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
old_tss_base, &next_tss_desc);
else
- ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+ ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
old_tss_base, &next_tss_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2128,21 +2379,17 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
next_tss_desc.type |= (1 << 1); /* set busy flag */
- write_segment_descriptor(ctxt, ops, tss_selector,
- &next_tss_desc);
+ write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
}
- ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
- ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
- ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
+ ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
+ ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
if (has_error_code) {
- struct decode_cache *c = &ctxt->decode;
-
- c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
- c->lock_prefix = 0;
- c->src.val = (unsigned long) error_code;
- emulate_push(ctxt, ops);
+ ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+ ctxt->lock_prefix = 0;
+ ctxt->src.val = (unsigned long) error_code;
+ ret = em_push(ctxt);
}
return ret;
@@ -2152,50 +2399,37 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, int reason,
bool has_error_code, u32 error_code)
{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct decode_cache *c = &ctxt->decode;
int rc;
- c->eip = ctxt->eip;
- c->dst.type = OP_NONE;
+ ctxt->_eip = ctxt->eip;
+ ctxt->dst.type = OP_NONE;
- rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+ rc = emulator_do_task_switch(ctxt, tss_selector, reason,
has_error_code, error_code);
- if (rc == X86EMUL_CONTINUE) {
- rc = writeback(ctxt, ops);
- if (rc == X86EMUL_CONTINUE)
- ctxt->eip = c->eip;
- }
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->eip = ctxt->_eip;
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
}
static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
int reg, struct operand *op)
{
- struct decode_cache *c = &ctxt->decode;
int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
- register_address_increment(c, &c->regs[reg], df * op->bytes);
- op->addr.mem.ea = register_address(c, c->regs[reg]);
+ register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
+ op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
op->addr.mem.seg = seg;
}
-static int em_push(struct x86_emulate_ctxt *ctxt)
-{
- emulate_push(ctxt, ctxt->ops);
- return X86EMUL_CONTINUE;
-}
-
static int em_das(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
u8 al, old_al;
bool af, cf, old_cf;
cf = ctxt->eflags & X86_EFLAGS_CF;
- al = c->dst.val;
+ al = ctxt->dst.val;
old_al = al;
old_cf = cf;
@@ -2213,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
cf = true;
}
- c->dst.val = al;
+ ctxt->dst.val = al;
/* Set PF, ZF, SF */
- c->src.type = OP_IMM;
- c->src.val = 0;
- c->src.bytes = 1;
- emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
if (cf)
ctxt->eflags |= X86_EFLAGS_CF;
@@ -2229,118 +2463,561 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
static int em_call_far(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
u16 sel, old_cs;
ulong old_eip;
int rc;
- old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- old_eip = c->eip;
+ old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ old_eip = ctxt->_eip;
- memcpy(&sel, c->src.valptr + c->op_bytes, 2);
- if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+ if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
return X86EMUL_CONTINUE;
- c->eip = 0;
- memcpy(&c->eip, c->src.valptr, c->op_bytes);
+ ctxt->_eip = 0;
+ memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
- c->src.val = old_cs;
- emulate_push(ctxt, ctxt->ops);
- rc = writeback(ctxt, ctxt->ops);
+ ctxt->src.val = old_cs;
+ rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
- c->src.val = old_eip;
- emulate_push(ctxt, ctxt->ops);
- rc = writeback(ctxt, ctxt->ops);
+ ctxt->src.val = old_eip;
+ return em_push(ctxt);
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = &ctxt->_eip;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
+ register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
- c->dst.type = OP_NONE;
+static int em_add(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+static int em_or(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
return X86EMUL_CONTINUE;
}
-static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+static int em_adc(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- int rc;
+ emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
- c->dst.type = OP_REG;
- c->dst.addr.reg = &c->eip;
- c->dst.bytes = c->op_bytes;
- rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
+static int em_sbb(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags);
return X86EMUL_CONTINUE;
}
-static int em_imul(struct x86_emulate_ctxt *ctxt)
+static int em_and(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sub(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
- emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
+static int em_xor(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags);
return X86EMUL_CONTINUE;
}
-static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+static int em_cmp(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_test(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_xchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Write back the register source. */
+ ctxt->src.val = ctxt->dst.val;
+ write_register_operand(&ctxt->src);
+
+ /* Write back the memory destination with implicit LOCK prefix. */
+ ctxt->dst.val = ctxt->src.orig_val;
+ ctxt->lock_prefix = 1;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags);
+ return X86EMUL_CONTINUE;
+}
- c->dst.val = c->src2.val;
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = ctxt->src2.val;
return em_imul(ctxt);
}
static int em_cwd(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
-
- c->dst.type = OP_REG;
- c->dst.bytes = c->src.bytes;
- c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
- c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.bytes = ctxt->src.bytes;
+ ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+ ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
return X86EMUL_CONTINUE;
}
static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
{
- unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
- struct decode_cache *c = &ctxt->decode;
u64 tsc = 0;
- if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
- return emulate_gp(ctxt, 0);
- ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
- c->regs[VCPU_REGS_RAX] = (u32)tsc;
- c->regs[VCPU_REGS_RDX] = tsc >> 32;
+ ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
+ ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
+ ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
return X86EMUL_CONTINUE;
}
static int em_mov(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- c->dst.val = c->src.val;
+ ctxt->dst.val = ctxt->src.val;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
+}
+
+static int em_movdqu(struct x86_emulate_ctxt *ctxt)
+{
+ memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_invlpg(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->ops->invlpg(ctxt, linear);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clts(struct x86_emulate_ctxt *ctxt)
+{
+ ulong cr0;
+
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ cr0 &= ~X86_CR0_TS;
+ ctxt->ops->set_cr(ctxt, 0, cr0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_vmcall(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1)
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = ctxt->ops->fix_hypercall(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ /* Let the processor re-execute the fixed hypercall */
+ ctxt->_eip = ctxt->eip;
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+{
+ struct desc_ptr desc_ptr;
+ int rc;
+
+ rc = read_descriptor(ctxt, ctxt->src.addr.mem,
+ &desc_ptr.size, &desc_ptr.address,
+ ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ rc = ctxt->ops->fix_hypercall(ctxt);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return rc;
+}
+
+static int em_lidt(struct x86_emulate_ctxt *ctxt)
+{
+ struct desc_ptr desc_ptr;
+ int rc;
+
+ rc = read_descriptor(ctxt, ctxt->src.addr.mem,
+ &desc_ptr.size, &desc_ptr.address,
+ ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->ops->set_idt(ctxt, &desc_ptr);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_smsw(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lmsw(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
+ | (ctxt->src.val & 0x0f));
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_loop(struct x86_emulate_ctxt *ctxt)
+{
+ register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
+ if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
+ (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
+ jmp_rel(ctxt, ctxt->src.val);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_jcxz(struct x86_emulate_ctxt *ctxt)
+{
+ if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
+ jmp_rel(ctxt, ctxt->src.val);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cli(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sti(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+ ctxt->eflags |= X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static bool valid_cr(int nr)
+{
+ switch (nr) {
+ case 0:
+ case 2 ... 4:
+ case 8:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int check_cr_read(struct x86_emulate_ctxt *ctxt)
+{
+ if (!valid_cr(ctxt->modrm_reg))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ u64 new_val = ctxt->src.val64;
+ int cr = ctxt->modrm_reg;
+ u64 efer = 0;
+
+ static u64 cr_reserved_bits[] = {
+ 0xffffffff00000000ULL,
+ 0, 0, 0, /* CR3 checked later */
+ CR4_RESERVED_BITS,
+ 0, 0, 0,
+ CR8_RESERVED_BITS,
+ };
+
+ if (!valid_cr(cr))
+ return emulate_ud(ctxt);
+
+ if (new_val & cr_reserved_bits[cr])
+ return emulate_gp(ctxt, 0);
+
+ switch (cr) {
+ case 0: {
+ u64 cr4;
+ if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
+ ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
+ return emulate_gp(ctxt, 0);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
+ !(cr4 & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 3: {
+ u64 rsvd = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ rsvd = CR3_L_MODE_RESERVED_BITS;
+ else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
+ rsvd = CR3_PAE_RESERVED_BITS;
+ else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
+ rsvd = CR3_NONPAE_RESERVED_BITS;
+
+ if (new_val & rsvd)
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 4: {
+ u64 cr4;
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long dr7;
+
+ ctxt->ops->get_dr(ctxt, 7, &dr7);
+
+ /* Check if DR7.Global_Enable is set */
+ return dr7 & (1 << 13);
+}
+
+static int check_dr_read(struct x86_emulate_ctxt *ctxt)
+{
+ int dr = ctxt->modrm_reg;
+ u64 cr4;
+
+ if (dr > 7)
+ return emulate_ud(ctxt);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
+ return emulate_ud(ctxt);
+
+ if (check_dr7_gd(ctxt))
+ return emulate_db(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ u64 new_val = ctxt->src.val64;
+ int dr = ctxt->modrm_reg;
+
+ if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
+ return emulate_gp(ctxt, 0);
+
+ return check_dr_read(ctxt);
+}
+
+static int check_svme(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(efer & EFER_SVME))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
+{
+ u64 rax = ctxt->regs[VCPU_REGS_RAX];
+
+ /* Valid physical address? */
+ if (rax & 0xffff000000000000ULL)
+ return emulate_gp(ctxt, 0);
+
+ return check_svme(ctxt);
+}
+
+static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+ if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+ u64 rcx = ctxt->regs[VCPU_REGS_RCX];
+
+ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
+ (rcx > 3))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_in(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.bytes = min(ctxt->src.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
+ return emulate_gp(ctxt, 0);
+
return X86EMUL_CONTINUE;
}
#define D(_y) { .flags = (_y) }
+#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
+ .check_perm = (_p) }
#define N D(0)
+#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
-#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define II(_f, _e, _i) \
+ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
+#define IIP(_f, _e, _i, _p) \
+ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
+ .check_perm = (_p) }
+#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
-#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \
- D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
- D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
+#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+ I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+ I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+
+static struct opcode group7_rm1[] = {
+ DI(SrcNone | ModRM | Priv, monitor),
+ DI(SrcNone | ModRM | Priv, mwait),
+ N, N, N, N, N, N,
+};
+
+static struct opcode group7_rm3[] = {
+ DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
+ II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
+ DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
+ DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
+ DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
+ DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
+ DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
+ DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
+};
+static struct opcode group7_rm7[] = {
+ N,
+ DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
+ N, N, N, N, N, N,
+};
static struct opcode group1[] = {
- X7(D(Lock)), N
+ I(Lock, em_add),
+ I(Lock, em_or),
+ I(Lock, em_adc),
+ I(Lock, em_sbb),
+ I(Lock, em_and),
+ I(Lock, em_sub),
+ I(Lock, em_xor),
+ I(0, em_cmp),
};
static struct opcode group1A[] = {
@@ -2366,16 +3043,28 @@ static struct opcode group5[] = {
D(SrcMem | ModRM | Stack), N,
};
+static struct opcode group6[] = {
+ DI(ModRM | Prot, sldt),
+ DI(ModRM | Prot, str),
+ DI(ModRM | Prot | Priv, lldt),
+ DI(ModRM | Prot | Priv, ltr),
+ N, N, N, N,
+};
+
static struct group_dual group7 = { {
- N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
- D(SrcNone | ModRM | DstMem | Mov), N,
- D(SrcMem16 | ModRM | Mov | Priv),
- D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
+ DI(ModRM | Mov | DstMem | Priv, sgdt),
+ DI(ModRM | Mov | DstMem | Priv, sidt),
+ II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
+ II(ModRM | SrcMem | Priv, em_lidt, lidt),
+ II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
+ II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
}, {
- D(SrcNone | ModRM | Priv | VendorSpecific), N,
- N, D(SrcNone | ModRM | Priv | VendorSpecific),
- D(SrcNone | ModRM | DstMem | Mov), N,
- D(SrcMem16 | ModRM | Mov | Priv), N,
+ I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
+ EXT(0, group7_rm1),
+ N, EXT(0, group7_rm3),
+ II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
} };
static struct opcode group8[] = {
@@ -2394,35 +3083,40 @@ static struct opcode group11[] = {
I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
};
+static struct gprefix pfx_0f_6f_0f_7f = {
+ N, N, N, I(Sse, em_movdqu),
+};
+
static struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
- D6ALU(Lock),
+ I6ALU(Lock, em_add),
D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
/* 0x08 - 0x0F */
- D6ALU(Lock),
+ I6ALU(Lock, em_or),
D(ImplicitOps | Stack | No64), N,
/* 0x10 - 0x17 */
- D6ALU(Lock),
+ I6ALU(Lock, em_adc),
D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
/* 0x18 - 0x1F */
- D6ALU(Lock),
+ I6ALU(Lock, em_sbb),
D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
/* 0x20 - 0x27 */
- D6ALU(Lock), N, N,
+ I6ALU(Lock, em_and), N, N,
/* 0x28 - 0x2F */
- D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
+ I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
/* 0x30 - 0x37 */
- D6ALU(Lock), N, N,
+ I6ALU(Lock, em_xor), N, N,
/* 0x38 - 0x3F */
- D6ALU(0), N, N,
+ I6ALU(0, em_cmp), N, N,
/* 0x40 - 0x4F */
X16(D(DstReg)),
/* 0x50 - 0x57 */
X8(I(SrcReg | Stack, em_push)),
/* 0x58 - 0x5F */
- X8(D(DstReg | Stack)),
+ X8(I(DstReg | Stack, em_pop)),
/* 0x60 - 0x67 */
- D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ I(ImplicitOps | Stack | No64, em_pusha),
+ I(ImplicitOps | Stack | No64, em_popa),
N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
N, N, N, N,
/* 0x68 - 0x6F */
@@ -2430,8 +3124,8 @@ static struct opcode opcode_table[256] = {
I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
I(SrcImmByte | Mov | Stack, em_push),
I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
- D2bv(DstDI | Mov | String), /* insb, insw/insd */
- D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
+ D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */
+ D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
X16(D(SrcImmByte)),
/* 0x80 - 0x87 */
@@ -2439,28 +3133,32 @@ static struct opcode opcode_table[256] = {
G(DstMem | SrcImm | ModRM | Group, group1),
G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
G(DstMem | SrcImmByte | ModRM | Group, group1),
- D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
+ I2bv(DstMem | SrcReg | ModRM, em_test),
+ I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
/* 0x88 - 0x8F */
I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
- D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
- D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
+ I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
+ D(ModRM | SrcMem | NoAccess | DstReg),
+ I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
+ G(0, group1A),
/* 0x90 - 0x97 */
- X8(D(SrcAcc | DstReg)),
+ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
/* 0x98 - 0x9F */
D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
I(SrcImmFAddr | No64, em_call_far), N,
- D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
+ II(ImplicitOps | Stack, em_pushf, pushf),
+ II(ImplicitOps | Stack, em_popf, popf), N, N,
/* 0xA0 - 0xA7 */
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
I2bv(SrcSI | DstDI | Mov | String, em_mov),
- D2bv(SrcSI | DstDI | String),
+ I2bv(SrcSI | DstDI | String, em_cmp),
/* 0xA8 - 0xAF */
- D2bv(DstAcc | SrcImm),
+ I2bv(DstAcc | SrcImm, em_test),
I2bv(SrcAcc | DstDI | Mov | String, em_mov),
I2bv(SrcSI | DstAcc | Mov | String, em_mov),
- D2bv(SrcAcc | DstDI | String),
+ I2bv(SrcAcc | DstDI | String, em_cmp),
/* 0xB0 - 0xB7 */
X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
/* 0xB8 - 0xBF */
@@ -2468,49 +3166,61 @@ static struct opcode opcode_table[256] = {
/* 0xC0 - 0xC7 */
D2bv(DstMem | SrcImmByte | ModRM),
I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
- D(ImplicitOps | Stack),
+ I(ImplicitOps | Stack, em_ret),
D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
- N, N, N, D(ImplicitOps | Stack),
- D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
+ N, N, N, I(ImplicitOps | Stack, em_ret_far),
+ D(ImplicitOps), DI(SrcImmByte, intn),
+ D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
/* 0xD0 - 0xD7 */
D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
N, N, N, N,
/* 0xD8 - 0xDF */
N, N, N, N, N, N, N, N,
/* 0xE0 - 0xE7 */
- X4(D(SrcImmByte)),
- D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
+ X3(I(SrcImmByte, em_loop)),
+ I(SrcImmByte, em_jcxz),
+ D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
+ D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
/* 0xE8 - 0xEF */
D(SrcImm | Stack), D(SrcImm | ImplicitOps),
- D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
- D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps),
+ I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
+ D2bvIP(SrcDX | DstAcc, in, check_perm_in),
+ D2bvIP(SrcAcc | DstDX, out, check_perm_out),
/* 0xF0 - 0xF7 */
- N, N, N, N,
- D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
+ N, DI(ImplicitOps, icebp), N, N,
+ DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
+ G(ByteOp, group3), G(0, group3),
/* 0xF8 - 0xFF */
- D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
+ D(ImplicitOps), D(ImplicitOps),
+ I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
};
static struct opcode twobyte_table[256] = {
/* 0x00 - 0x0F */
- N, GD(0, &group7), N, N,
- N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
- D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
+ G(0, group6), GD(0, &group7), N, N,
+ N, I(ImplicitOps | VendorSpecific, em_syscall),
+ II(ImplicitOps | Priv, em_clts, clts), N,
+ DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM), N, N,
/* 0x10 - 0x1F */
N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
/* 0x20 - 0x2F */
- D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
- D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
+ DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
+ DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
+ DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
+ DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
N, N, N, N,
N, N, N, N, N, N, N, N,
/* 0x30 - 0x3F */
- D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
- D(ImplicitOps | Priv), N,
- D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
+ DI(ImplicitOps | Priv, wrmsr),
+ IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
+ DI(ImplicitOps | Priv, rdmsr),
+ DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
+ I(ImplicitOps | VendorSpecific, em_sysenter),
+ I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
N, N,
N, N, N, N, N, N, N, N,
/* 0x40 - 0x4F */
@@ -2518,21 +3228,27 @@ static struct opcode twobyte_table[256] = {
/* 0x50 - 0x5F */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0x60 - 0x6F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x70 - 0x7F */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x80 - 0x8F */
X16(D(SrcImm)),
/* 0x90 - 0x9F */
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
D(ImplicitOps | Stack), D(ImplicitOps | Stack),
- N, D(DstMem | SrcReg | ModRM | BitOp),
+ DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM), N, N,
/* 0xA8 - 0xAF */
D(ImplicitOps | Stack), D(ImplicitOps | Stack),
- N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM),
D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
@@ -2564,16 +3280,19 @@ static struct opcode twobyte_table[256] = {
#undef G
#undef GD
#undef I
+#undef GP
+#undef EXT
#undef D2bv
+#undef D2bvIP
#undef I2bv
-#undef D6ALU
+#undef I6ALU
-static unsigned imm_size(struct decode_cache *c)
+static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
{
unsigned size;
- size = (c->d & ByteOp) ? 1 : c->op_bytes;
+ size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
if (size == 8)
size = 4;
return size;
@@ -2582,23 +3301,21 @@ static unsigned imm_size(struct decode_cache *c)
static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
unsigned size, bool sign_extension)
{
- struct decode_cache *c = &ctxt->decode;
- struct x86_emulate_ops *ops = ctxt->ops;
int rc = X86EMUL_CONTINUE;
op->type = OP_IMM;
op->bytes = size;
- op->addr.mem.ea = c->eip;
+ op->addr.mem.ea = ctxt->_eip;
/* NB. Immediates are sign-extended as necessary. */
switch (op->bytes) {
case 1:
- op->val = insn_fetch(s8, 1, c->eip);
+ op->val = insn_fetch(s8, 1, ctxt->_eip);
break;
case 2:
- op->val = insn_fetch(s16, 2, c->eip);
+ op->val = insn_fetch(s16, 2, ctxt->_eip);
break;
case 4:
- op->val = insn_fetch(s32, 4, c->eip);
+ op->val = insn_fetch(s32, 4, ctxt->_eip);
break;
}
if (!sign_extension) {
@@ -2618,23 +3335,20 @@ done:
return rc;
}
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
{
- struct x86_emulate_ops *ops = ctxt->ops;
- struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes, dual, goffset;
- struct opcode opcode, *g_mod012, *g_mod3;
- struct operand memop = { .type = OP_NONE };
-
- c->eip = ctxt->eip;
- c->fetch.start = c->eip;
- c->fetch.end = c->fetch.start + insn_len;
+ int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
+ bool op_prefix = false;
+ struct opcode opcode;
+ struct operand memop = { .type = OP_NONE }, *memopp = NULL;
+
+ ctxt->_eip = ctxt->eip;
+ ctxt->fetch.start = ctxt->_eip;
+ ctxt->fetch.end = ctxt->fetch.start + insn_len;
if (insn_len > 0)
- memcpy(c->fetch.data, insn, insn_len);
- ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
+ memcpy(ctxt->fetch.data, insn, insn_len);
switch (mode) {
case X86EMUL_MODE_REAL:
@@ -2655,47 +3369,46 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
return -1;
}
- c->op_bytes = def_op_bytes;
- c->ad_bytes = def_ad_bytes;
+ ctxt->op_bytes = def_op_bytes;
+ ctxt->ad_bytes = def_ad_bytes;
/* Legacy prefixes. */
for (;;) {
- switch (c->b = insn_fetch(u8, 1, c->eip)) {
+ switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) {
case 0x66: /* operand-size override */
+ op_prefix = true;
/* switch between 2/4 bytes */
- c->op_bytes = def_op_bytes ^ 6;
+ ctxt->op_bytes = def_op_bytes ^ 6;
break;
case 0x67: /* address-size override */
if (mode == X86EMUL_MODE_PROT64)
/* switch between 4/8 bytes */
- c->ad_bytes = def_ad_bytes ^ 12;
+ ctxt->ad_bytes = def_ad_bytes ^ 12;
else
/* switch between 2/4 bytes */
- c->ad_bytes = def_ad_bytes ^ 6;
+ ctxt->ad_bytes = def_ad_bytes ^ 6;
break;
case 0x26: /* ES override */
case 0x2e: /* CS override */
case 0x36: /* SS override */
case 0x3e: /* DS override */
- set_seg_override(c, (c->b >> 3) & 3);
+ set_seg_override(ctxt, (ctxt->b >> 3) & 3);
break;
case 0x64: /* FS override */
case 0x65: /* GS override */
- set_seg_override(c, c->b & 7);
+ set_seg_override(ctxt, ctxt->b & 7);
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
goto done_prefixes;
- c->rex_prefix = c->b;
+ ctxt->rex_prefix = ctxt->b;
continue;
case 0xf0: /* LOCK */
- c->lock_prefix = 1;
+ ctxt->lock_prefix = 1;
break;
case 0xf2: /* REPNE/REPNZ */
- c->rep_prefix = REPNE_PREFIX;
- break;
case 0xf3: /* REP/REPE/REPZ */
- c->rep_prefix = REPE_PREFIX;
+ ctxt->rep_prefix = ctxt->b;
break;
default:
goto done_prefixes;
@@ -2703,96 +3416,116 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
/* Any legacy prefix after a REX prefix nullifies its effect. */
- c->rex_prefix = 0;
+ ctxt->rex_prefix = 0;
}
done_prefixes:
/* REX prefix. */
- if (c->rex_prefix & 8)
- c->op_bytes = 8; /* REX.W */
+ if (ctxt->rex_prefix & 8)
+ ctxt->op_bytes = 8; /* REX.W */
/* Opcode byte(s). */
- opcode = opcode_table[c->b];
+ opcode = opcode_table[ctxt->b];
/* Two-byte opcode? */
- if (c->b == 0x0f) {
- c->twobyte = 1;
- c->b = insn_fetch(u8, 1, c->eip);
- opcode = twobyte_table[c->b];
+ if (ctxt->b == 0x0f) {
+ ctxt->twobyte = 1;
+ ctxt->b = insn_fetch(u8, 1, ctxt->_eip);
+ opcode = twobyte_table[ctxt->b];
}
- c->d = opcode.flags;
-
- if (c->d & Group) {
- dual = c->d & GroupDual;
- c->modrm = insn_fetch(u8, 1, c->eip);
- --c->eip;
-
- if (c->d & GroupDual) {
- g_mod012 = opcode.u.gdual->mod012;
- g_mod3 = opcode.u.gdual->mod3;
- } else
- g_mod012 = g_mod3 = opcode.u.group;
-
- c->d &= ~(Group | GroupDual);
-
- goffset = (c->modrm >> 3) & 7;
+ ctxt->d = opcode.flags;
+
+ while (ctxt->d & GroupMask) {
+ switch (ctxt->d & GroupMask) {
+ case Group:
+ ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+ --ctxt->_eip;
+ goffset = (ctxt->modrm >> 3) & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case GroupDual:
+ ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+ --ctxt->_eip;
+ goffset = (ctxt->modrm >> 3) & 7;
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.gdual->mod3[goffset];
+ else
+ opcode = opcode.u.gdual->mod012[goffset];
+ break;
+ case RMExt:
+ goffset = ctxt->modrm & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case Prefix:
+ if (ctxt->rep_prefix && op_prefix)
+ return X86EMUL_UNHANDLEABLE;
+ simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
+ switch (simd_prefix) {
+ case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
+ case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
+ case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
+ case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
+ }
+ break;
+ default:
+ return X86EMUL_UNHANDLEABLE;
+ }
- if ((c->modrm >> 6) == 3)
- opcode = g_mod3[goffset];
- else
- opcode = g_mod012[goffset];
- c->d |= opcode.flags;
+ ctxt->d &= ~GroupMask;
+ ctxt->d |= opcode.flags;
}
- c->execute = opcode.u.execute;
+ ctxt->execute = opcode.u.execute;
+ ctxt->check_perm = opcode.check_perm;
+ ctxt->intercept = opcode.intercept;
/* Unrecognised? */
- if (c->d == 0 || (c->d & Undefined))
+ if (ctxt->d == 0 || (ctxt->d & Undefined))
return -1;
- if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
+ if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
return -1;
- if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
- c->op_bytes = 8;
+ if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
+ ctxt->op_bytes = 8;
- if (c->d & Op3264) {
+ if (ctxt->d & Op3264) {
if (mode == X86EMUL_MODE_PROT64)
- c->op_bytes = 8;
+ ctxt->op_bytes = 8;
else
- c->op_bytes = 4;
+ ctxt->op_bytes = 4;
}
+ if (ctxt->d & Sse)
+ ctxt->op_bytes = 16;
+
/* ModRM and SIB bytes. */
- if (c->d & ModRM) {
- rc = decode_modrm(ctxt, ops, &memop);
- if (!c->has_seg_override)
- set_seg_override(c, c->modrm_seg);
- } else if (c->d & MemAbs)
- rc = decode_abs(ctxt, ops, &memop);
+ if (ctxt->d & ModRM) {
+ rc = decode_modrm(ctxt, &memop);
+ if (!ctxt->has_seg_override)
+ set_seg_override(ctxt, ctxt->modrm_seg);
+ } else if (ctxt->d & MemAbs)
+ rc = decode_abs(ctxt, &memop);
if (rc != X86EMUL_CONTINUE)
goto done;
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_DS);
+ if (!ctxt->has_seg_override)
+ set_seg_override(ctxt, VCPU_SREG_DS);
- memop.addr.mem.seg = seg_override(ctxt, ops, c);
+ memop.addr.mem.seg = seg_override(ctxt);
- if (memop.type == OP_MEM && c->ad_bytes != 8)
+ if (memop.type == OP_MEM && ctxt->ad_bytes != 8)
memop.addr.mem.ea = (u32)memop.addr.mem.ea;
- if (memop.type == OP_MEM && c->rip_relative)
- memop.addr.mem.ea += c->eip;
-
/*
* Decode and fetch the source operand: register, memory
* or immediate.
*/
- switch (c->d & SrcMask) {
+ switch (ctxt->d & SrcMask) {
case SrcNone:
break;
case SrcReg:
- decode_register_operand(&c->src, c, 0);
+ decode_register_operand(ctxt, &ctxt->src, 0);
break;
case SrcMem16:
memop.bytes = 2;
@@ -2801,54 +3534,61 @@ done_prefixes:
memop.bytes = 4;
goto srcmem_common;
case SrcMem:
- memop.bytes = (c->d & ByteOp) ? 1 :
- c->op_bytes;
+ memop.bytes = (ctxt->d & ByteOp) ? 1 :
+ ctxt->op_bytes;
srcmem_common:
- c->src = memop;
+ ctxt->src = memop;
+ memopp = &ctxt->src;
break;
case SrcImmU16:
- rc = decode_imm(ctxt, &c->src, 2, false);
+ rc = decode_imm(ctxt, &ctxt->src, 2, false);
break;
case SrcImm:
- rc = decode_imm(ctxt, &c->src, imm_size(c), true);
+ rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
break;
case SrcImmU:
- rc = decode_imm(ctxt, &c->src, imm_size(c), false);
+ rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
break;
case SrcImmByte:
- rc = decode_imm(ctxt, &c->src, 1, true);
+ rc = decode_imm(ctxt, &ctxt->src, 1, true);
break;
case SrcImmUByte:
- rc = decode_imm(ctxt, &c->src, 1, false);
+ rc = decode_imm(ctxt, &ctxt->src, 1, false);
break;
case SrcAcc:
- c->src.type = OP_REG;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
- fetch_register_operand(&c->src);
+ ctxt->src.type = OP_REG;
+ ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+ fetch_register_operand(&ctxt->src);
break;
case SrcOne:
- c->src.bytes = 1;
- c->src.val = 1;
+ ctxt->src.bytes = 1;
+ ctxt->src.val = 1;
break;
case SrcSI:
- c->src.type = OP_MEM;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.addr.mem.ea =
- register_address(c, c->regs[VCPU_REGS_RSI]);
- c->src.addr.mem.seg = seg_override(ctxt, ops, c),
- c->src.val = 0;
+ ctxt->src.type = OP_MEM;
+ ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ ctxt->src.addr.mem.ea =
+ register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
+ ctxt->src.addr.mem.seg = seg_override(ctxt);
+ ctxt->src.val = 0;
break;
case SrcImmFAddr:
- c->src.type = OP_IMM;
- c->src.addr.mem.ea = c->eip;
- c->src.bytes = c->op_bytes + 2;
- insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+ ctxt->src.type = OP_IMM;
+ ctxt->src.addr.mem.ea = ctxt->_eip;
+ ctxt->src.bytes = ctxt->op_bytes + 2;
+ insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
break;
case SrcMemFAddr:
- memop.bytes = c->op_bytes + 2;
+ memop.bytes = ctxt->op_bytes + 2;
goto srcmem_common;
break;
+ case SrcDX:
+ ctxt->src.type = OP_REG;
+ ctxt->src.bytes = 2;
+ ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+ fetch_register_operand(&ctxt->src);
+ break;
}
if (rc != X86EMUL_CONTINUE)
@@ -2858,22 +3598,22 @@ done_prefixes:
* Decode and fetch the second source operand: register, memory
* or immediate.
*/
- switch (c->d & Src2Mask) {
+ switch (ctxt->d & Src2Mask) {
case Src2None:
break;
case Src2CL:
- c->src2.bytes = 1;
- c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+ ctxt->src2.bytes = 1;
+ ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8;
break;
case Src2ImmByte:
- rc = decode_imm(ctxt, &c->src2, 1, true);
+ rc = decode_imm(ctxt, &ctxt->src2, 1, true);
break;
case Src2One:
- c->src2.bytes = 1;
- c->src2.val = 1;
+ ctxt->src2.bytes = 1;
+ ctxt->src2.val = 1;
break;
case Src2Imm:
- rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
+ rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
break;
}
@@ -2881,58 +3621,66 @@ done_prefixes:
goto done;
/* Decode and fetch the destination operand: register or memory. */
- switch (c->d & DstMask) {
+ switch (ctxt->d & DstMask) {
case DstReg:
- decode_register_operand(&c->dst, c,
- c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+ decode_register_operand(ctxt, &ctxt->dst,
+ ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
break;
case DstImmUByte:
- c->dst.type = OP_IMM;
- c->dst.addr.mem.ea = c->eip;
- c->dst.bytes = 1;
- c->dst.val = insn_fetch(u8, 1, c->eip);
+ ctxt->dst.type = OP_IMM;
+ ctxt->dst.addr.mem.ea = ctxt->_eip;
+ ctxt->dst.bytes = 1;
+ ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
break;
case DstMem:
case DstMem64:
- c->dst = memop;
- if ((c->d & DstMask) == DstMem64)
- c->dst.bytes = 8;
+ ctxt->dst = memop;
+ memopp = &ctxt->dst;
+ if ((ctxt->d & DstMask) == DstMem64)
+ ctxt->dst.bytes = 8;
else
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- if (c->d & BitOp)
- fetch_bit_operand(c);
- c->dst.orig_val = c->dst.val;
+ ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ if (ctxt->d & BitOp)
+ fetch_bit_operand(ctxt);
+ ctxt->dst.orig_val = ctxt->dst.val;
break;
case DstAcc:
- c->dst.type = OP_REG;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
- fetch_register_operand(&c->dst);
- c->dst.orig_val = c->dst.val;
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+ fetch_register_operand(&ctxt->dst);
+ ctxt->dst.orig_val = ctxt->dst.val;
break;
case DstDI:
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.addr.mem.ea =
- register_address(c, c->regs[VCPU_REGS_RDI]);
- c->dst.addr.mem.seg = VCPU_SREG_ES;
- c->dst.val = 0;
+ ctxt->dst.type = OP_MEM;
+ ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ ctxt->dst.addr.mem.ea =
+ register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
+ ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
+ ctxt->dst.val = 0;
+ break;
+ case DstDX:
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.bytes = 2;
+ ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+ fetch_register_operand(&ctxt->dst);
break;
case ImplicitOps:
/* Special instructions do their own operand decoding. */
default:
- c->dst.type = OP_NONE; /* Disable writeback. */
- return 0;
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
+ break;
}
done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ if (memopp && memopp->type == OP_MEM && ctxt->rip_relative)
+ memopp->addr.mem.ea += ctxt->_eip;
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
}
static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
-
/* The second termination condition only applies for REPE
* and REPNE. Test if the repeat string operation prefix is
* REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
@@ -2940,333 +3688,231 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
* - if REPE/REPZ and ZF = 0 then done
* - if REPNE/REPNZ and ZF = 1 then done
*/
- if (((c->b == 0xa6) || (c->b == 0xa7) ||
- (c->b == 0xae) || (c->b == 0xaf))
- && (((c->rep_prefix == REPE_PREFIX) &&
+ if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
+ (ctxt->b == 0xae) || (ctxt->b == 0xaf))
+ && (((ctxt->rep_prefix == REPE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == 0))
- || ((c->rep_prefix == REPNE_PREFIX) &&
+ || ((ctxt->rep_prefix == REPNE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
return true;
return false;
}
-int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
{
struct x86_emulate_ops *ops = ctxt->ops;
u64 msr_data;
- struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
- int saved_dst_type = c->dst.type;
- int irq; /* Used for int 3, int, and into */
+ int saved_dst_type = ctxt->dst.type;
- ctxt->decode.mem_read.pos = 0;
+ ctxt->mem_read.pos = 0;
- if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
rc = emulate_ud(ctxt);
goto done;
}
/* LOCK prefix is allowed only with some instructions */
- if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
+ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
rc = emulate_ud(ctxt);
goto done;
}
- if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
+ if ((ctxt->d & Sse)
+ && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
+ || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
rc = emulate_ud(ctxt);
goto done;
}
+ if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
+ goto done;
+ }
+
+ if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
/* Privileged instruction can be executed only in CPL=0 */
- if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
+ if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
rc = emulate_gp(ctxt, 0);
goto done;
}
- if (c->rep_prefix && (c->d & String)) {
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* Do instruction specific permission checks */
+ if (ctxt->check_perm) {
+ rc = ctxt->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
/* All REP prefixes have the same first termination condition */
- if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
- ctxt->eip = c->eip;
+ if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
+ ctxt->eip = ctxt->_eip;
goto done;
}
}
- if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
- rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
- c->src.valptr, c->src.bytes);
+ if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
+ rc = segmented_read(ctxt, ctxt->src.addr.mem,
+ ctxt->src.valptr, ctxt->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
- c->src.orig_val64 = c->src.val64;
+ ctxt->src.orig_val64 = ctxt->src.val64;
}
- if (c->src2.type == OP_MEM) {
- rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
- &c->src2.val, c->src2.bytes);
+ if (ctxt->src2.type == OP_MEM) {
+ rc = segmented_read(ctxt, ctxt->src2.addr.mem,
+ &ctxt->src2.val, ctxt->src2.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
}
- if ((c->d & DstMask) == ImplicitOps)
+ if ((ctxt->d & DstMask) == ImplicitOps)
goto special_insn;
- if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+ if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
/* optimisation - avoid slow emulated read if Mov */
- rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
- &c->dst.val, c->dst.bytes);
+ rc = segmented_read(ctxt, ctxt->dst.addr.mem,
+ &ctxt->dst.val, ctxt->dst.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
}
- c->dst.orig_val = c->dst.val;
+ ctxt->dst.orig_val = ctxt->dst.val;
special_insn:
- if (c->execute) {
- rc = c->execute(ctxt);
+ if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_MEMACCESS);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->execute) {
+ rc = ctxt->execute(ctxt);
if (rc != X86EMUL_CONTINUE)
goto done;
goto writeback;
}
- if (c->twobyte)
+ if (ctxt->twobyte)
goto twobyte_insn;
- switch (c->b) {
- case 0x00 ... 0x05:
- add: /* add */
- emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
- break;
+ switch (ctxt->b) {
case 0x06: /* push es */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
+ rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
break;
case 0x07: /* pop es */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
- break;
- case 0x08 ... 0x0d:
- or: /* or */
- emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+ rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
break;
case 0x0e: /* push cs */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
- break;
- case 0x10 ... 0x15:
- adc: /* adc */
- emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+ rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
break;
case 0x16: /* push ss */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
+ rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
break;
case 0x17: /* pop ss */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
- break;
- case 0x18 ... 0x1d:
- sbb: /* sbb */
- emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+ rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
break;
case 0x1e: /* push ds */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
+ rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
break;
case 0x1f: /* pop ds */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
- break;
- case 0x20 ... 0x25:
- and: /* and */
- emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
- break;
- case 0x28 ... 0x2d:
- sub: /* sub */
- emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
- break;
- case 0x30 ... 0x35:
- xor: /* xor */
- emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
- break;
- case 0x38 ... 0x3d:
- cmp: /* cmp */
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
break;
case 0x40 ... 0x47: /* inc r16/r32 */
- emulate_1op("inc", c->dst, ctxt->eflags);
+ emulate_1op("inc", ctxt->dst, ctxt->eflags);
break;
case 0x48 ... 0x4f: /* dec r16/r32 */
- emulate_1op("dec", c->dst, ctxt->eflags);
- break;
- case 0x58 ... 0x5f: /* pop reg */
- pop_instruction:
- rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
- break;
- case 0x60: /* pusha */
- rc = emulate_pusha(ctxt, ops);
- break;
- case 0x61: /* popa */
- rc = emulate_popa(ctxt, ops);
+ emulate_1op("dec", ctxt->dst, ctxt->eflags);
break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
goto cannot_emulate;
- c->dst.val = (s32) c->src.val;
+ ctxt->dst.val = (s32) ctxt->src.val;
break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
- c->src.val = c->regs[VCPU_REGS_RDX];
+ ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
goto do_io_in;
case 0x6e: /* outsb */
case 0x6f: /* outsw/outsd */
- c->dst.val = c->regs[VCPU_REGS_RDX];
+ ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
goto do_io_out;
break;
case 0x70 ... 0x7f: /* jcc (short) */
- if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, c->src.val);
- break;
- case 0x80 ... 0x83: /* Grp1 */
- switch (c->modrm_reg) {
- case 0:
- goto add;
- case 1:
- goto or;
- case 2:
- goto adc;
- case 3:
- goto sbb;
- case 4:
- goto and;
- case 5:
- goto sub;
- case 6:
- goto xor;
- case 7:
- goto cmp;
- }
- break;
- case 0x84 ... 0x85:
- test:
- emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
- break;
- case 0x86 ... 0x87: /* xchg */
- xchg:
- /* Write back the register source. */
- c->src.val = c->dst.val;
- write_register_operand(&c->src);
- /*
- * Write back the memory destination with implicit LOCK
- * prefix.
- */
- c->dst.val = c->src.orig_val;
- c->lock_prefix = 1;
- break;
- case 0x8c: /* mov r/m, sreg */
- if (c->modrm_reg > VCPU_SREG_GS) {
- rc = emulate_ud(ctxt);
- goto done;
- }
- c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
+ if (test_cc(ctxt->b, ctxt->eflags))
+ jmp_rel(ctxt, ctxt->src.val);
break;
case 0x8d: /* lea r16/r32, m */
- c->dst.val = c->src.addr.mem.ea;
- break;
- case 0x8e: { /* mov seg, r/m16 */
- uint16_t sel;
-
- sel = c->src.val;
-
- if (c->modrm_reg == VCPU_SREG_CS ||
- c->modrm_reg > VCPU_SREG_GS) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- if (c->modrm_reg == VCPU_SREG_SS)
- ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
-
- rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
-
- c->dst.type = OP_NONE; /* Disable writeback. */
+ ctxt->dst.val = ctxt->src.addr.mem.ea;
break;
- }
case 0x8f: /* pop (sole member of Grp1a) */
- rc = emulate_grp1a(ctxt, ops);
+ rc = em_grp1a(ctxt);
break;
case 0x90 ... 0x97: /* nop / xchg reg, rax */
- if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
+ if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
break;
- goto xchg;
+ rc = em_xchg(ctxt);
+ break;
case 0x98: /* cbw/cwde/cdqe */
- switch (c->op_bytes) {
- case 2: c->dst.val = (s8)c->dst.val; break;
- case 4: c->dst.val = (s16)c->dst.val; break;
- case 8: c->dst.val = (s32)c->dst.val; break;
+ switch (ctxt->op_bytes) {
+ case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
+ case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
+ case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
}
break;
- case 0x9c: /* pushf */
- c->src.val = (unsigned long) ctxt->eflags;
- emulate_push(ctxt, ops);
- break;
- case 0x9d: /* popf */
- c->dst.type = OP_REG;
- c->dst.addr.reg = &ctxt->eflags;
- c->dst.bytes = c->op_bytes;
- rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
- break;
- case 0xa6 ... 0xa7: /* cmps */
- c->dst.type = OP_NONE; /* Disable writeback. */
- goto cmp;
- case 0xa8 ... 0xa9: /* test ax, imm */
- goto test;
- case 0xae ... 0xaf: /* scas */
- goto cmp;
case 0xc0 ... 0xc1:
- emulate_grp2(ctxt);
+ rc = em_grp2(ctxt);
break;
- case 0xc3: /* ret */
- c->dst.type = OP_REG;
- c->dst.addr.reg = &c->eip;
- c->dst.bytes = c->op_bytes;
- goto pop_instruction;
case 0xc4: /* les */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
+ rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
break;
case 0xc5: /* lds */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
- break;
- case 0xcb: /* ret far */
- rc = emulate_ret_far(ctxt, ops);
+ rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
break;
case 0xcc: /* int3 */
- irq = 3;
- goto do_interrupt;
+ rc = emulate_int(ctxt, 3);
+ break;
case 0xcd: /* int n */
- irq = c->src.val;
- do_interrupt:
- rc = emulate_int(ctxt, ops, irq);
+ rc = emulate_int(ctxt, ctxt->src.val);
break;
case 0xce: /* into */
- if (ctxt->eflags & EFLG_OF) {
- irq = 4;
- goto do_interrupt;
- }
- break;
- case 0xcf: /* iret */
- rc = emulate_iret(ctxt, ops);
+ if (ctxt->eflags & EFLG_OF)
+ rc = emulate_int(ctxt, 4);
break;
case 0xd0 ... 0xd1: /* Grp2 */
- emulate_grp2(ctxt);
+ rc = em_grp2(ctxt);
break;
case 0xd2 ... 0xd3: /* Grp2 */
- c->src.val = c->regs[VCPU_REGS_RCX];
- emulate_grp2(ctxt);
- break;
- case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
- register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
- if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
- (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
- jmp_rel(c, c->src.val);
- break;
- case 0xe3: /* jcxz/jecxz/jrcxz */
- if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
- jmp_rel(c, c->src.val);
+ ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
+ rc = em_grp2(ctxt);
break;
case 0xe4: /* inb */
case 0xe5: /* in */
@@ -3275,67 +3921,40 @@ special_insn:
case 0xe7: /* out */
goto do_io_out;
case 0xe8: /* call (near) */ {
- long int rel = c->src.val;
- c->src.val = (unsigned long) c->eip;
- jmp_rel(c, rel);
- emulate_push(ctxt, ops);
+ long int rel = ctxt->src.val;
+ ctxt->src.val = (unsigned long) ctxt->_eip;
+ jmp_rel(ctxt, rel);
+ rc = em_push(ctxt);
break;
}
case 0xe9: /* jmp rel */
- goto jmp;
- case 0xea: { /* jmp far */
- unsigned short sel;
- jump_far:
- memcpy(&sel, c->src.valptr + c->op_bytes, 2);
-
- if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
- goto done;
-
- c->eip = 0;
- memcpy(&c->eip, c->src.valptr, c->op_bytes);
- break;
- }
- case 0xeb:
- jmp: /* jmp rel short */
- jmp_rel(c, c->src.val);
- c->dst.type = OP_NONE; /* Disable writeback. */
+ case 0xeb: /* jmp rel short */
+ jmp_rel(ctxt, ctxt->src.val);
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xec: /* in al,dx */
case 0xed: /* in (e/r)ax,dx */
- c->src.val = c->regs[VCPU_REGS_RDX];
do_io_in:
- c->dst.bytes = min(c->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
- if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
- &c->dst.val))
+ if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+ &ctxt->dst.val))
goto done; /* IO is needed */
break;
case 0xee: /* out dx,al */
case 0xef: /* out dx,(e/r)ax */
- c->dst.val = c->regs[VCPU_REGS_RDX];
do_io_out:
- c->src.bytes = min(c->src.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->dst.val,
- c->src.bytes)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
- ops->pio_out_emulated(c->src.bytes, c->dst.val,
- &c->src.val, 1, ctxt->vcpu);
- c->dst.type = OP_NONE; /* Disable writeback. */
+ ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+ &ctxt->src.val, 1);
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
- ctxt->vcpu->arch.halt_request = 1;
+ ctxt->ops->halt(ctxt);
break;
case 0xf5: /* cmc */
/* complement carry flag from eflags reg */
ctxt->eflags ^= EFLG_CF;
break;
case 0xf6 ... 0xf7: /* Grp3 */
- rc = emulate_grp3(ctxt, ops);
+ rc = em_grp3(ctxt);
break;
case 0xf8: /* clc */
ctxt->eflags &= ~EFLG_CF;
@@ -3343,22 +3962,6 @@ special_insn:
case 0xf9: /* stc */
ctxt->eflags |= EFLG_CF;
break;
- case 0xfa: /* cli */
- if (emulator_bad_iopl(ctxt, ops)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- } else
- ctxt->eflags &= ~X86_EFLAGS_IF;
- break;
- case 0xfb: /* sti */
- if (emulator_bad_iopl(ctxt, ops)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- } else {
- ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
- ctxt->eflags |= X86_EFLAGS_IF;
- }
- break;
case 0xfc: /* cld */
ctxt->eflags &= ~EFLG_DF;
break;
@@ -3366,13 +3969,11 @@ special_insn:
ctxt->eflags |= EFLG_DF;
break;
case 0xfe: /* Grp4 */
- grp45:
- rc = emulate_grp45(ctxt, ops);
+ rc = em_grp45(ctxt);
break;
case 0xff: /* Grp5 */
- if (c->modrm_reg == 5)
- goto jump_far;
- goto grp45;
+ rc = em_grp45(ctxt);
+ break;
default:
goto cannot_emulate;
}
@@ -3381,7 +3982,7 @@ special_insn:
goto done;
writeback:
- rc = writeback(ctxt, ops);
+ rc = writeback(ctxt);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3389,183 +3990,89 @@ writeback:
* restore dst type in case the decoding will be reused
* (happens for string instruction )
*/
- c->dst.type = saved_dst_type;
+ ctxt->dst.type = saved_dst_type;
- if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override(ctxt, ops, c),
- VCPU_REGS_RSI, &c->src);
+ if ((ctxt->d & SrcMask) == SrcSI)
+ string_addr_inc(ctxt, seg_override(ctxt),
+ VCPU_REGS_RSI, &ctxt->src);
- if ((c->d & DstMask) == DstDI)
+ if ((ctxt->d & DstMask) == DstDI)
string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
- &c->dst);
+ &ctxt->dst);
- if (c->rep_prefix && (c->d & String)) {
- struct read_cache *r = &ctxt->decode.io_read;
- register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ struct read_cache *r = &ctxt->io_read;
+ register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
if (!string_insn_completed(ctxt)) {
/*
* Re-enter guest when pio read ahead buffer is empty
* or, if it is not used, after each 1024 iteration.
*/
- if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
+ if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
(r->end == 0 || r->end != r->pos)) {
/*
* Reset read cache. Usually happens before
* decode, but since instruction is restarted
* we have to do it here.
*/
- ctxt->decode.mem_read.end = 0;
+ ctxt->mem_read.end = 0;
return EMULATION_RESTART;
}
goto done; /* skip rip writeback */
}
}
- ctxt->eip = c->eip;
+ ctxt->eip = ctxt->_eip;
done:
if (rc == X86EMUL_PROPAGATE_FAULT)
ctxt->have_exception = true;
+ if (rc == X86EMUL_INTERCEPTED)
+ return EMULATION_INTERCEPTED;
+
return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
twobyte_insn:
- switch (c->b) {
- case 0x01: /* lgdt, lidt, lmsw */
- switch (c->modrm_reg) {
- u16 size;
- unsigned long address;
-
- case 0: /* vmcall */
- if (c->modrm_mod != 3 || c->modrm_rm != 1)
- goto cannot_emulate;
-
- rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
-
- /* Let the processor re-execute the fixed hypercall */
- c->eip = ctxt->eip;
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 2: /* lgdt */
- rc = read_descriptor(ctxt, ops, c->src.addr.mem,
- &size, &address, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- realmode_lgdt(ctxt->vcpu, size, address);
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 3: /* lidt/vmmcall */
- if (c->modrm_mod == 3) {
- switch (c->modrm_rm) {
- case 1:
- rc = kvm_fix_hypercall(ctxt->vcpu);
- break;
- default:
- goto cannot_emulate;
- }
- } else {
- rc = read_descriptor(ctxt, ops, c->src.addr.mem,
- &size, &address,
- c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- realmode_lidt(ctxt->vcpu, size, address);
- }
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 4: /* smsw */
- c->dst.bytes = 2;
- c->dst.val = ops->get_cr(0, ctxt->vcpu);
- break;
- case 6: /* lmsw */
- ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
- (c->src.val & 0x0f), ctxt->vcpu);
- c->dst.type = OP_NONE;
- break;
- case 5: /* not defined */
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu,
- linear(ctxt, c->src.addr.mem));
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- default:
- goto cannot_emulate;
- }
- break;
- case 0x05: /* syscall */
- rc = emulate_syscall(ctxt, ops);
- break;
- case 0x06:
- emulate_clts(ctxt->vcpu);
- break;
+ switch (ctxt->b) {
case 0x09: /* wbinvd */
- kvm_emulate_wbinvd(ctxt->vcpu);
+ (ctxt->ops->wbinvd)(ctxt);
break;
case 0x08: /* invd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
break;
case 0x20: /* mov cr, reg */
- switch (c->modrm_reg) {
- case 1:
- case 5 ... 7:
- case 9 ... 15:
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
- c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
+ ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
break;
case 0x21: /* mov from dr to reg */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
- ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
+ ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
break;
case 0x22: /* mov reg, cr */
- if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
+ if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
- c->dst.type = OP_NONE;
+ ctxt->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- emulate_ud(ctxt);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
-
- if (ops->set_dr(c->modrm_reg, c->src.val &
+ if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
((ctxt->mode == X86EMUL_MODE_PROT64) ?
- ~0ULL : ~0U), ctxt->vcpu) < 0) {
+ ~0ULL : ~0U)) < 0) {
/* #UD condition is already handled by the code above */
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
- c->dst.type = OP_NONE; /* no writeback */
+ ctxt->dst.type = OP_NONE; /* no writeback */
break;
case 0x30:
/* wrmsr */
- msr_data = (u32)c->regs[VCPU_REGS_RAX]
- | ((u64)c->regs[VCPU_REGS_RDX] << 32);
- if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
+ msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
+ | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
+ if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
@@ -3574,64 +4081,58 @@ twobyte_insn:
break;
case 0x32:
/* rdmsr */
- if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
+ if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
} else {
- c->regs[VCPU_REGS_RAX] = (u32)msr_data;
- c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+ ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
+ ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
}
rc = X86EMUL_CONTINUE;
break;
- case 0x34: /* sysenter */
- rc = emulate_sysenter(ctxt, ops);
- break;
- case 0x35: /* sysexit */
- rc = emulate_sysexit(ctxt, ops);
- break;
case 0x40 ... 0x4f: /* cmov */
- c->dst.val = c->dst.orig_val = c->src.val;
- if (!test_cc(c->b, ctxt->eflags))
- c->dst.type = OP_NONE; /* no writeback */
+ ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
+ if (!test_cc(ctxt->b, ctxt->eflags))
+ ctxt->dst.type = OP_NONE; /* no writeback */
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/
- if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, c->src.val);
+ if (test_cc(ctxt->b, ctxt->eflags))
+ jmp_rel(ctxt, ctxt->src.val);
break;
case 0x90 ... 0x9f: /* setcc r/m8 */
- c->dst.val = test_cc(c->b, ctxt->eflags);
+ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
break;
case 0xa0: /* push fs */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
+ rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
break;
case 0xa1: /* pop fs */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
+ rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
break;
case 0xa3:
bt: /* bt */
- c->dst.type = OP_NONE;
+ ctxt->dst.type = OP_NONE;
/* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+ emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 0xa4: /* shld imm8, r, r/m */
case 0xa5: /* shld cl, r, r/m */
- emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+ emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 0xa8: /* push gs */
- emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
+ rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
break;
case 0xa9: /* pop gs */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
+ rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
break;
case 0xab:
bts: /* bts */
- emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 0xac: /* shrd imm8, r, r/m */
case 0xad: /* shrd cl, r, r/m */
- emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+ emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 0xae: /* clflush */
break;
@@ -3640,38 +4141,38 @@ twobyte_insn:
* Save real source value, then compare EAX against
* destination.
*/
- c->src.orig_val = c->src.val;
- c->src.val = c->regs[VCPU_REGS_RAX];
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ ctxt->src.orig_val = ctxt->src.val;
+ ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+ emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
if (ctxt->eflags & EFLG_ZF) {
/* Success: write back to memory. */
- c->dst.val = c->src.orig_val;
+ ctxt->dst.val = ctxt->src.orig_val;
} else {
/* Failure: write the value we saw to EAX. */
- c->dst.type = OP_REG;
- c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
}
break;
case 0xb2: /* lss */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
+ rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
break;
case 0xb3:
btr: /* btr */
- emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 0xb4: /* lfs */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
+ rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
break;
case 0xb5: /* lgs */
- rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
+ rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
break;
case 0xb6 ... 0xb7: /* movzx */
- c->dst.bytes = c->op_bytes;
- c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
- : (u16) c->src.val;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
+ : (u16) ctxt->src.val;
break;
case 0xba: /* Grp8 */
- switch (c->modrm_reg & 3) {
+ switch (ctxt->modrm_reg & 3) {
case 0:
goto bt;
case 1:
@@ -3684,50 +4185,50 @@ twobyte_insn:
break;
case 0xbb:
btc: /* btc */
- emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags);
break;
case 0xbc: { /* bsf */
u8 zf;
__asm__ ("bsf %2, %0; setz %1"
- : "=r"(c->dst.val), "=q"(zf)
- : "r"(c->src.val));
+ : "=r"(ctxt->dst.val), "=q"(zf)
+ : "r"(ctxt->src.val));
ctxt->eflags &= ~X86_EFLAGS_ZF;
if (zf) {
ctxt->eflags |= X86_EFLAGS_ZF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
}
break;
}
case 0xbd: { /* bsr */
u8 zf;
__asm__ ("bsr %2, %0; setz %1"
- : "=r"(c->dst.val), "=q"(zf)
- : "r"(c->src.val));
+ : "=r"(ctxt->dst.val), "=q"(zf)
+ : "r"(ctxt->src.val));
ctxt->eflags &= ~X86_EFLAGS_ZF;
if (zf) {
ctxt->eflags |= X86_EFLAGS_ZF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
}
break;
}
case 0xbe ... 0xbf: /* movsx */
- c->dst.bytes = c->op_bytes;
- c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
- (s16) c->src.val;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
+ (s16) ctxt->src.val;
break;
case 0xc0 ... 0xc1: /* xadd */
- emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+ emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
/* Write back the register source. */
- c->src.val = c->dst.orig_val;
- write_register_operand(&c->src);
+ ctxt->src.val = ctxt->dst.orig_val;
+ write_register_operand(&ctxt->src);
break;
case 0xc3: /* movnti */
- c->dst.bytes = c->op_bytes;
- c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
- (u64) c->src.val;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
+ (u64) ctxt->src.val;
break;
case 0xc7: /* Grp9 (cmpxchg8b) */
- rc = emulate_grp9(ctxt, ops);
+ rc = em_grp9(ctxt);
break;
default:
goto cannot_emulate;
@@ -3739,5 +4240,5 @@ twobyte_insn:
goto writeback;
cannot_emulate:
- return -1;
+ return EMULATION_FAILED;
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 46d08ca0b48..51a97426e79 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -33,7 +33,6 @@ struct kvm_kpit_state {
};
struct kvm_pit {
- unsigned long base_addresss;
struct kvm_io_device dev;
struct kvm_io_device speaker_dev;
struct kvm *kvm;
@@ -51,7 +50,6 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ba910d14941..53e2d084bff 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm);
void kvm_destroy_pic(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
-void kvm_pic_clear_isr_ack(struct kvm *kvm);
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{
@@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
-int pit_has_pending_timer(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2b2255b1f04..57dcbd4308f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,7 +33,7 @@
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include "kvm_cache_regs.h"
#include "irq.h"
#include "trace.h"
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 22fae7593ee..1c5b69373a0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,7 +22,6 @@
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
-#include "x86.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -148,7 +147,7 @@ module_param(oos_shadow, bool, 0644);
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
| PT64_NX_MASK)
-#define RMAP_EXT 4
+#define PTE_LIST_EXT 4
#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -164,16 +163,16 @@ module_param(oos_shadow, bool, 0644);
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-struct kvm_rmap_desc {
- u64 *sptes[RMAP_EXT];
- struct kvm_rmap_desc *more;
+struct pte_list_desc {
+ u64 *sptes[PTE_LIST_EXT];
+ struct pte_list_desc *more;
};
struct kvm_shadow_walk_iterator {
u64 addr;
hpa_t shadow_addr;
- int level;
u64 *sptep;
+ int level;
unsigned index;
};
@@ -182,32 +181,68 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)) && \
+ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
+ __shadow_walk_next(&(_walker), spte))
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
-static inline u64 rsvd_bits(int s, int e)
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
{
- return ((1ULL << (e - s + 1)) - 1) << s;
+ shadow_mmio_mask = mmio_mask;
}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
{
- shadow_trap_nonpresent_pte = trap_pte;
- shadow_notrap_nonpresent_pte = notrap_pte;
+ access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+ trace_mark_mmio_spte(sptep, gfn, access);
+ mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+ if (unlikely(is_noslot_pfn(pfn))) {
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
+static inline u64 rsvd_bits(int s, int e)
+{
+ return ((1ULL << (e - s + 1)) - 1) << s;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
@@ -220,11 +255,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-static bool is_write_protection(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
static int is_cpuid_PSE36(void)
{
return 1;
@@ -237,8 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
static int is_shadow_present_pte(u64 pte)
{
- return pte != shadow_trap_nonpresent_pte
- && pte != shadow_notrap_nonpresent_pte;
+ return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
}
static int is_large_pte(u64 pte)
@@ -246,11 +275,6 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_writable_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
static int is_dirty_gpte(unsigned long pte)
{
return pte & PT_DIRTY_MASK;
@@ -282,25 +306,153 @@ static gfn_t pse36_gfn_delta(u32 gpte)
return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
+#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
- set_64bit(sptep, spte);
+ *sptep = spte;
}
-static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
-#ifdef CONFIG_X86_64
- return xchg(sptep, new_spte);
+ *sptep = spte;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ return ACCESS_ONCE(*sptep);
+}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+ /* It is valid if the spte is zapped. */
+ return spte == 0ull;
+}
#else
- u64 old_spte;
+union split_spte {
+ struct {
+ u32 spte_low;
+ u32 spte_high;
+ };
+ u64 spte;
+};
- do {
- old_spte = *sptep;
- } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
- return old_spte;
-#endif
+ if (is_shadow_present_pte(spte))
+ return;
+
+ /* Ensure the spte is completely set before we increase the count */
+ smp_wmb();
+ sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_high = sspte.spte_high;
+
+ /*
+ * If we map the spte from nonpresent to present, We should store
+ * the high bits firstly, then set present bit, so cpu can not
+ * fetch this spte while we are setting the spte.
+ */
+ smp_wmb();
+
+ ssptep->spte_low = sspte.spte_low;
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_low = sspte.spte_low;
+
+ /*
+ * If we map the spte from present to nonpresent, we should clear
+ * present bit firstly to avoid vcpu fetch the old high bits.
+ */
+ smp_wmb();
+
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte, orig;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+ orig.spte_high = ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+
+ return orig.spte;
+}
+
+/*
+ * The idea using the light way get the spte on x86_32 guest is from
+ * gup_get_pte(arch/x86/mm/gup.c).
+ * The difference is we can not catch the spte tlb flush if we leave
+ * guest mode, so we emulate it by increase clear_spte_count when spte
+ * is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ union split_spte spte, *orig = (union split_spte *)sptep;
+ int count;
+
+retry:
+ count = sp->clear_spte_count;
+ smp_rmb();
+
+ spte.spte_low = orig->spte_low;
+ smp_rmb();
+
+ spte.spte_high = orig->spte_high;
+ smp_rmb();
+
+ if (unlikely(spte.spte_low != orig->spte_low ||
+ count != sp->clear_spte_count))
+ goto retry;
+
+ return spte.spte;
+}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+ union split_spte sspte = (union split_spte)spte;
+ u32 high_mmio_mask = shadow_mmio_mask >> 32;
+
+ /* It is valid if the spte is zapped. */
+ if (spte == 0ull)
+ return true;
+
+ /* It is valid if the spte is being zapped. */
+ if (sspte.spte_low == 0ull &&
+ (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
+ return true;
+
+ return false;
}
+#endif
static bool spte_has_volatile_bits(u64 spte)
{
@@ -322,12 +474,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
return (old_spte & bit_mask) && !(new_spte & bit_mask);
}
-static void update_spte(u64 *sptep, u64 new_spte)
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+ WARN_ON(is_shadow_present_pte(*sptep));
+ __set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changged.
+ */
+static void mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 mask, old_spte = *sptep;
WARN_ON(!is_rmap_spte(new_spte));
+ if (!is_shadow_present_pte(old_spte))
+ return mmu_spte_set(sptep, new_spte);
+
new_spte |= old_spte & shadow_dirty_mask;
mask = shadow_accessed_mask;
@@ -335,9 +505,9 @@ static void update_spte(u64 *sptep, u64 new_spte)
mask |= shadow_dirty_mask;
if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
- __set_spte(sptep, new_spte);
+ __update_clear_spte_fast(sptep, new_spte);
else
- old_spte = __xchg_spte(sptep, new_spte);
+ old_spte = __update_clear_spte_slow(sptep, new_spte);
if (!shadow_accessed_mask)
return;
@@ -348,6 +518,64 @@ static void update_spte(u64 *sptep, u64 new_spte)
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
}
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and track the
+ * state bits, it is used to clear the last level sptep.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+ pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!spte_has_volatile_bits(old_spte))
+ __update_clear_spte_fast(sptep, 0ull);
+ else
+ old_spte = __update_clear_spte_slow(sptep, 0ull);
+
+ if (!is_rmap_spte(old_spte))
+ return 0;
+
+ pfn = spte_to_pfn(old_spte);
+ if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+ kvm_set_pfn_dirty(pfn);
+ return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear spte without caring the state bits of sptep,
+ * it is used to set the upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+ __update_clear_spte_fast(sptep, 0ull);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+ return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+ rcu_read_lock();
+ atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+ /* Increase the counter before walking shadow page table */
+ smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+ /* Decrease the counter after walking shadow page table finished */
+ smp_mb__before_atomic_dec();
+ atomic_dec(&vcpu->kvm->arch.reader_counter);
+ rcu_read_unlock();
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -397,12 +625,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
int r;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
- pte_chain_cache, 4);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
if (r)
goto out;
r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -416,8 +640,8 @@ out:
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ pte_list_desc_cache);
mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
mmu_page_header_cache);
@@ -433,26 +657,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
return p;
}
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
- sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
- kmem_cache_free(pte_chain_cache, pc);
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
+ sizeof(struct pte_list_desc));
}
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
- sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
- kmem_cache_free(rmap_desc_cache, rd);
+ kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
@@ -498,6 +711,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count += 1;
}
+ kvm->arch.indirect_shadow_pages++;
}
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -513,6 +727,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
linfo->write_count -= 1;
WARN_ON(linfo->write_count < 0);
}
+ kvm->arch.indirect_shadow_pages--;
}
static int has_wrprotected_page(struct kvm *kvm,
@@ -565,7 +780,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
- return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
+ return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
}
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -588,67 +803,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
}
/*
- * Take gfn and return the reverse mapping to it.
- */
-
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
-{
- struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
-
- slot = gfn_to_memslot(kvm, gfn);
- if (likely(level == PT_PAGE_TABLE_LEVEL))
- return &slot->rmap[gfn - slot->base_gfn];
-
- linfo = lpage_info_slot(gfn, slot, level);
-
- return &linfo->rmap_pde;
-}
-
-/*
- * Reverse mapping data structures:
+ * Pte mapping structures:
*
- * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
- * that points to page_address(page).
+ * If pte_list bit zero is zero, then pte_list point to the spte.
*
- * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
- * containing more mappings.
+ * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
+ * pte_list_desc containing more mappings.
*
- * Returns the number of rmap entries before the spte was added or zero if
+ * Returns the number of pte entries before the spte was added or zero if
* the spte was not added.
*
*/
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
+ unsigned long *pte_list)
{
- struct kvm_mmu_page *sp;
- struct kvm_rmap_desc *desc;
- unsigned long *rmapp;
+ struct pte_list_desc *desc;
int i, count = 0;
- if (!is_rmap_spte(*spte))
- return count;
- sp = page_header(__pa(spte));
- kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- if (!*rmapp) {
- rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
- *rmapp = (unsigned long)spte;
- } else if (!(*rmapp & 1)) {
- rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
- desc = mmu_alloc_rmap_desc(vcpu);
- desc->sptes[0] = (u64 *)*rmapp;
+ if (!*pte_list) {
+ rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+ *pte_list = (unsigned long)spte;
+ } else if (!(*pte_list & 1)) {
+ rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+ desc = mmu_alloc_pte_list_desc(vcpu);
+ desc->sptes[0] = (u64 *)*pte_list;
desc->sptes[1] = spte;
- *rmapp = (unsigned long)desc | 1;
+ *pte_list = (unsigned long)desc | 1;
++count;
} else {
- rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (desc->sptes[RMAP_EXT-1] && desc->more) {
+ rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+ desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+ while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
desc = desc->more;
- count += RMAP_EXT;
+ count += PTE_LIST_EXT;
}
- if (desc->sptes[RMAP_EXT-1]) {
- desc->more = mmu_alloc_rmap_desc(vcpu);
+ if (desc->sptes[PTE_LIST_EXT-1]) {
+ desc->more = mmu_alloc_pte_list_desc(vcpu);
desc = desc->more;
}
for (i = 0; desc->sptes[i]; ++i)
@@ -658,59 +848,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
return count;
}
-static void rmap_desc_remove_entry(unsigned long *rmapp,
- struct kvm_rmap_desc *desc,
- int i,
- struct kvm_rmap_desc *prev_desc)
+static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
+{
+ struct pte_list_desc *desc;
+ u64 *prev_spte;
+ int i;
+
+ if (!*pte_list)
+ return NULL;
+ else if (!(*pte_list & 1)) {
+ if (!spte)
+ return (u64 *)*pte_list;
+ return NULL;
+ }
+ desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+ prev_spte = NULL;
+ while (desc) {
+ for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+ if (prev_spte == spte)
+ return desc->sptes[i];
+ prev_spte = desc->sptes[i];
+ }
+ desc = desc->more;
+ }
+ return NULL;
+}
+
+static void
+pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
+ int i, struct pte_list_desc *prev_desc)
{
int j;
- for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
+ for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
;
desc->sptes[i] = desc->sptes[j];
desc->sptes[j] = NULL;
if (j != 0)
return;
if (!prev_desc && !desc->more)
- *rmapp = (unsigned long)desc->sptes[0];
+ *pte_list = (unsigned long)desc->sptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
else
- *rmapp = (unsigned long)desc->more | 1;
- mmu_free_rmap_desc(desc);
+ *pte_list = (unsigned long)desc->more | 1;
+ mmu_free_pte_list_desc(desc);
}
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void pte_list_remove(u64 *spte, unsigned long *pte_list)
{
- struct kvm_rmap_desc *desc;
- struct kvm_rmap_desc *prev_desc;
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- unsigned long *rmapp;
+ struct pte_list_desc *desc;
+ struct pte_list_desc *prev_desc;
int i;
- sp = page_header(__pa(spte));
- gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
- if (!*rmapp) {
- printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
+ if (!*pte_list) {
+ printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
BUG();
- } else if (!(*rmapp & 1)) {
- rmap_printk("rmap_remove: %p 1->0\n", spte);
- if ((u64 *)*rmapp != spte) {
- printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
+ } else if (!(*pte_list & 1)) {
+ rmap_printk("pte_list_remove: %p 1->0\n", spte);
+ if ((u64 *)*pte_list != spte) {
+ printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
BUG();
}
- *rmapp = 0;
+ *pte_list = 0;
} else {
- rmap_printk("rmap_remove: %p many->many\n", spte);
- desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+ rmap_printk("pte_list_remove: %p many->many\n", spte);
+ desc = (struct pte_list_desc *)(*pte_list & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
+ for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
if (desc->sptes[i] == spte) {
- rmap_desc_remove_entry(rmapp,
+ pte_list_desc_remove_entry(pte_list,
desc, i,
prev_desc);
return;
@@ -718,62 +927,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
prev_desc = desc;
desc = desc->more;
}
- pr_err("rmap_remove: %p many->many\n", spte);
+ pr_err("pte_list_remove: %p many->many\n", spte);
BUG();
}
}
-static int set_spte_track_bits(u64 *sptep, u64 new_spte)
+typedef void (*pte_list_walk_fn) (u64 *spte);
+static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
{
- pfn_t pfn;
- u64 old_spte = *sptep;
+ struct pte_list_desc *desc;
+ int i;
- if (!spte_has_volatile_bits(old_spte))
- __set_spte(sptep, new_spte);
- else
- old_spte = __xchg_spte(sptep, new_spte);
+ if (!*pte_list)
+ return;
- if (!is_rmap_spte(old_spte))
- return 0;
+ if (!(*pte_list & 1))
+ return fn((u64 *)*pte_list);
- pfn = spte_to_pfn(old_spte);
- if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
- kvm_set_pfn_dirty(pfn);
- return 1;
+ desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+ while (desc) {
+ for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+ fn(desc->sptes[i]);
+ desc = desc->more;
+ }
}
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+/*
+ * Take gfn and return the reverse mapping to it.
+ */
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
- if (set_spte_track_bits(sptep, new_spte))
- rmap_remove(kvm, sptep);
+ struct kvm_memory_slot *slot;
+ struct kvm_lpage_info *linfo;
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (likely(level == PT_PAGE_TABLE_LEVEL))
+ return &slot->rmap[gfn - slot->base_gfn];
+
+ linfo = lpage_info_slot(gfn, slot, level);
+
+ return &linfo->rmap_pde;
+}
+
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ unsigned long *rmapp;
+
+ sp = page_header(__pa(spte));
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+ return pte_list_add(vcpu, spte, rmapp);
}
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
- struct kvm_rmap_desc *desc;
- u64 *prev_spte;
- int i;
+ return pte_list_next(rmapp, spte);
+}
- if (!*rmapp)
- return NULL;
- else if (!(*rmapp & 1)) {
- if (!spte)
- return (u64 *)*rmapp;
- return NULL;
- }
- desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- prev_spte = NULL;
- while (desc) {
- for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
- if (prev_spte == spte)
- return desc->sptes[i];
- prev_spte = desc->sptes[i];
- }
- desc = desc->more;
- }
- return NULL;
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ unsigned long *rmapp;
+
+ sp = page_header(__pa(spte));
+ gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+ rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+ pte_list_remove(spte, rmapp);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep)
+{
+ if (mmu_spte_clear_track_bits(sptep))
+ rmap_remove(kvm, sptep);
}
static int rmap_write_protect(struct kvm *kvm, u64 gfn)
@@ -790,7 +1017,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- update_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -807,8 +1034,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- drop_spte(kvm, spte,
- shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
--kvm->stat.lpages;
spte = NULL;
write_protected = 1;
@@ -829,7 +1055,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -851,7 +1077,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -860,7 +1086,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
new_spte &= ~shadow_accessed_mask;
- set_spte_track_bits(spte, new_spte);
+ mmu_spte_clear_track_bits(spte);
+ mmu_spte_set(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -1032,151 +1259,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+/*
+ * Remove the sp from shadow page cache, after call it,
+ * we can not find this sp from the cache, and the shadow
+ * page table is still valid.
+ * It should be under the protection of mmu lock.
+ */
+static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
- list_del(&sp->link);
- free_page((unsigned long)sp->spt);
if (!sp->role.direct)
free_page((unsigned long)sp->gfns);
- kmem_cache_free(mmu_page_header_cache, sp);
- kvm_mod_used_mmu_pages(kvm, -1);
}
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
+/*
+ * Free the shadow page table and the sp, we can do it
+ * out of the protection of mmu lock.
+ */
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
- return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+ list_del(&sp->link);
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
}
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte, int direct)
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
- struct kvm_mmu_page *sp;
-
- sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
- sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- if (!direct)
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
- PAGE_SIZE);
- set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
- list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- sp->multimapped = 0;
- sp->parent_pte = parent_pte;
- kvm_mod_used_mmu_pages(vcpu->kvm, +1);
- return sp;
+ return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
}
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *parent_pte)
{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
if (!parent_pte)
return;
- if (!sp->multimapped) {
- u64 *old = sp->parent_pte;
- if (!old) {
- sp->parent_pte = parent_pte;
- return;
- }
- sp->multimapped = 1;
- pte_chain = mmu_alloc_pte_chain(vcpu);
- INIT_HLIST_HEAD(&sp->parent_ptes);
- hlist_add_head(&pte_chain->link, &sp->parent_ptes);
- pte_chain->parent_ptes[0] = old;
- }
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
- if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
- continue;
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
- if (!pte_chain->parent_ptes[i]) {
- pte_chain->parent_ptes[i] = parent_pte;
- return;
- }
- }
- pte_chain = mmu_alloc_pte_chain(vcpu);
- BUG_ON(!pte_chain);
- hlist_add_head(&pte_chain->link, &sp->parent_ptes);
- pte_chain->parent_ptes[0] = parent_pte;
+ pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
}
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
u64 *parent_pte)
{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!sp->multimapped) {
- BUG_ON(sp->parent_pte != parent_pte);
- sp->parent_pte = NULL;
- return;
- }
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- if (pte_chain->parent_ptes[i] != parent_pte)
- continue;
- while (i + 1 < NR_PTE_CHAIN_ENTRIES
- && pte_chain->parent_ptes[i + 1]) {
- pte_chain->parent_ptes[i]
- = pte_chain->parent_ptes[i + 1];
- ++i;
- }
- pte_chain->parent_ptes[i] = NULL;
- if (i == 0) {
- hlist_del(&pte_chain->link);
- mmu_free_pte_chain(pte_chain);
- if (hlist_empty(&sp->parent_ptes)) {
- sp->multimapped = 0;
- sp->parent_pte = NULL;
- }
- }
- return;
- }
- BUG();
+ pte_list_remove(parent_pte, &sp->parent_ptes);
}
-static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
+static void drop_parent_pte(struct kvm_mmu_page *sp,
+ u64 *parent_pte)
{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- struct kvm_mmu_page *parent_sp;
- int i;
-
- if (!sp->multimapped && sp->parent_pte) {
- parent_sp = page_header(__pa(sp->parent_pte));
- fn(parent_sp, sp->parent_pte);
- return;
- }
-
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- u64 *spte = pte_chain->parent_ptes[i];
+ mmu_page_remove_parent_pte(sp, parent_pte);
+ mmu_spte_clear_no_track(parent_pte);
+}
- if (!spte)
- break;
- parent_sp = page_header(__pa(spte));
- fn(parent_sp, spte);
- }
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+ u64 *parent_pte, int direct)
+{
+ struct kvm_mmu_page *sp;
+ sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
+ sizeof *sp);
+ sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ if (!direct)
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+ PAGE_SIZE);
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+ sp->parent_ptes = 0;
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+ return sp;
}
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- mmu_parent_walk(sp, mark_unsync);
+ pte_list_walk(&sp->parent_ptes, mark_unsync);
}
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
+static void mark_unsync(u64 *spte)
{
+ struct kvm_mmu_page *sp;
unsigned int index;
+ sp = page_header(__pa(spte));
index = spte - sp->spt;
if (__test_and_set_bit(index, sp->unsync_child_bitmap))
return;
@@ -1185,15 +1350,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
kvm_mmu_mark_parents_unsync(sp);
}
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
@@ -1206,7 +1362,7 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
- const void *pte, unsigned long mmu_seq)
+ const void *pte)
{
WARN_ON(1);
}
@@ -1475,6 +1631,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
}
}
+static void init_shadow_page_table(struct kvm_mmu_page *sp)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ sp->spt[i] = 0ull;
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -1537,10 +1701,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
account_shadowed(vcpu->kvm, gfn);
}
- if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
- vcpu->arch.mmu.prefetch_page(vcpu, sp);
- else
- nonpaging_prefetch_page(vcpu, sp);
+ init_shadow_page_table(sp);
trace_kvm_mmu_get_page(sp, true);
return sp;
}
@@ -1572,21 +1733,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
if (iterator->level < PT_PAGE_TABLE_LEVEL)
return false;
- if (iterator->level == PT_PAGE_TABLE_LEVEL)
- if (is_large_pte(*iterator->sptep))
- return false;
-
iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
return true;
}
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+ u64 spte)
{
- iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+ if (is_last_spte(spte, iterator->level)) {
+ iterator->level = 0;
+ return;
+ }
+
+ iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
--iterator->level;
}
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
u64 spte;
@@ -1594,13 +1762,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
spte = __pa(sp->spt)
| PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
- __set_spte(sptep, spte);
+ mmu_spte_set(sptep, spte);
}
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
if (is_large_pte(*sptep)) {
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
kvm_flush_remote_tlbs(vcpu->kvm);
}
}
@@ -1622,38 +1790,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (child->role.access == direct_access)
return;
- mmu_page_remove_parent_pte(child, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ drop_parent_pte(child, sptep);
kvm_flush_remote_tlbs(vcpu->kvm);
}
}
+static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *spte)
+{
+ u64 pte;
+ struct kvm_mmu_page *child;
+
+ pte = *spte;
+ if (is_shadow_present_pte(pte)) {
+ if (is_last_spte(pte, sp->role.level))
+ drop_spte(kvm, spte);
+ else {
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ drop_parent_pte(child, spte);
+ }
+ } else if (is_mmio_spte(pte))
+ mmu_spte_clear_no_track(spte);
+
+ if (is_large_pte(pte))
+ --kvm->stat.lpages;
+}
+
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
unsigned i;
- u64 *pt;
- u64 ent;
-
- pt = sp->spt;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- ent = pt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent),
- &pt[i]);
- } else {
- if (is_large_pte(ent))
- --kvm->stat.lpages;
- drop_spte(kvm, &pt[i],
- shadow_trap_nonpresent_pte);
- }
- }
- pt[i] = shadow_trap_nonpresent_pte;
- }
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ mmu_page_zap_pte(kvm, sp, sp->spt + i);
}
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1674,20 +1843,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *parent_pte;
- while (sp->multimapped || sp->parent_pte) {
- if (!sp->multimapped)
- parent_pte = sp->parent_pte;
- else {
- struct kvm_pte_chain *chain;
-
- chain = container_of(sp->parent_ptes.first,
- struct kvm_pte_chain, link);
- parent_pte = chain->parent_ptes[0];
- }
- BUG_ON(!parent_pte);
- kvm_mmu_put_page(sp, parent_pte);
- __set_spte(parent_pte, shadow_trap_nonpresent_pte);
- }
+ while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
+ drop_parent_pte(sp, parent_pte);
}
static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1734,6 +1891,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
/* Count self */
ret++;
list_move(&sp->link, invalid_list);
+ kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
@@ -1744,6 +1902,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
return ret;
}
+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, invalid_list, link)
+ kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+ struct kvm_mmu_page *next, *sp;
+
+ sp = container_of(head, struct kvm_mmu_page, rcu);
+ while (sp) {
+ if (!list_empty(&sp->link))
+ next = list_first_entry(&sp->link,
+ struct kvm_mmu_page, link);
+ else
+ next = NULL;
+ kvm_mmu_free_page(sp);
+ sp = next;
+ }
+}
+
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
{
@@ -1754,10 +1936,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
+ if (atomic_read(&kvm->arch.reader_counter)) {
+ kvm_mmu_isolate_pages(invalid_list);
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ list_del_init(invalid_list);
+
+ trace_kvm_mmu_delay_free_pages(sp);
+ call_rcu(&sp->rcu, free_pages_rcu);
+ return;
+ }
+
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_free_page(kvm, sp);
+ kvm_mmu_isolate_page(sp);
+ kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));
}
@@ -1783,8 +1976,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
@@ -1833,20 +2026,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
__set_bit(slot, sp->slot_bitmap);
}
-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
- int i;
- u64 *pt = sp->spt;
-
- if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (pt[i] == shadow_notrap_nonpresent_pte)
- __set_spte(&pt[i], shadow_trap_nonpresent_pte);
- }
-}
-
/*
* The function is based on mtrr_type_lookup() in
* arch/x86/kernel/cpu/mtrr/generic.c
@@ -1959,7 +2138,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sp->unsync = 1;
kvm_mmu_mark_parents_unsync(sp);
- mmu_convert_notrap(sp);
}
static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -2002,13 +2180,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
- int write_fault, int dirty, int level,
+ int write_fault, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
u64 spte, entry = *sptep;
int ret = 0;
+ if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+ return 0;
+
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -2017,8 +2198,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
- if (!dirty)
- pte_access &= ~ACC_WRITE_MASK;
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -2045,15 +2225,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
goto done;
}
spte |= PT_WRITABLE_MASK;
if (!vcpu->arch.mmu.direct_map
- && !(pte_access & ACC_WRITE_MASK))
+ && !(pte_access & ACC_WRITE_MASK)) {
spte &= ~PT_USER_MASK;
+ /*
+ * If we converted a user page to a kernel page,
+ * so that the kernel can write to it when cr0.wp=0,
+ * then we should prevent the kernel from executing it
+ * if SMEP is enabled.
+ */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ spte |= PT64_NX_MASK;
+ }
/*
* Optimization: for pte sync, if spte was writable the hash
@@ -2078,7 +2267,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- update_spte(sptep, spte);
+ mmu_spte_update(sptep, spte);
/*
* If we overwrite a writable spte with a read-only one we
* should flush remote TLBs. Otherwise rmap_write_protect
@@ -2093,8 +2282,8 @@ done:
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
- int *ptwrite, int level, gfn_t gfn,
+ int user_fault, int write_fault,
+ int *emulate, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
bool host_writable)
{
@@ -2117,26 +2306,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 pte = *sptep;
child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ drop_parent_pte(child, sptep);
kvm_flush_remote_tlbs(vcpu->kvm);
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %llx new %llx\n",
spte_to_pfn(*sptep), pfn);
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
}
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- dirty, level, gfn, pfn, speculative, true,
+ level, gfn, pfn, speculative, true,
host_writable)) {
if (write_fault)
- *ptwrite = 1;
+ *emulate = 1;
kvm_mmu_flush_tlb(vcpu);
}
+ if (unlikely(is_mmio_spte(*sptep) && emulate))
+ *emulate = 1;
+
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2145,11 +2336,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
- page_header_update_slot(vcpu->kvm, sptep, gfn);
- if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, sptep, gfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
+ if (is_shadow_present_pte(*sptep)) {
+ page_header_update_slot(vcpu->kvm, sptep, gfn);
+ if (!was_rmapped) {
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+ rmap_recycle(vcpu, sptep, gfn);
+ }
}
kvm_release_pfn_clean(pfn);
if (speculative) {
@@ -2170,8 +2363,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
if (!slot) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
+ get_page(fault_page);
+ return page_to_pfn(fault_page);
}
hva = gfn_to_hva_memslot(slot, gfn);
@@ -2198,7 +2391,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
for (i = 0; i < ret; i++, gfn++, start++)
mmu_set_spte(vcpu, start, ACC_ALL,
- access, 0, 0, 1, NULL,
+ access, 0, 0, NULL,
sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
@@ -2217,7 +2410,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
spte = sp->spt + i;
for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
- if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+ if (is_shadow_present_pte(*spte) || spte == sptep) {
if (!start)
continue;
if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2254,7 +2447,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- int pt_write = 0;
+ int emulate = 0;
gfn_t pseudo_gfn;
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2262,14 +2455,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
unsigned pte_access = ACC_ALL;
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, 1, &pt_write,
+ 0, write, &emulate,
level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
}
- if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+ if (!is_shadow_present_pte(*iterator.sptep)) {
u64 base_addr = iterator.addr;
base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2283,14 +2476,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
return -ENOMEM;
}
- __set_spte(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask
- | shadow_accessed_mask);
+ mmu_spte_set(iterator.sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask
+ | shadow_accessed_mask);
}
}
- return pt_write;
+ return emulate;
}
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2306,16 +2499,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_info(SIGBUS, &info, tsk);
}
-static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
{
kvm_release_pfn_clean(pfn);
if (is_hwpoison_pfn(pfn)) {
- kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+ kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
return 0;
- } else if (is_fault_pfn(pfn))
- return -EFAULT;
+ }
- return 1;
+ return -EFAULT;
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2360,6 +2552,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
}
}
+static bool mmu_invalid_pfn(pfn_t pfn)
+{
+ return unlikely(is_invalid_pfn(pfn));
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+ pfn_t pfn, unsigned access, int *ret_val)
+{
+ bool ret = true;
+
+ /* The pfn is invalid, report the error! */
+ if (unlikely(is_invalid_pfn(pfn))) {
+ *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+ goto exit;
+ }
+
+ if (unlikely(is_noslot_pfn(pfn)))
+ vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+
+ ret = false;
+exit:
+ return ret;
+}
+
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
@@ -2394,9 +2610,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
return 0;
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+ return r;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2623,6 +2838,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
+ vcpu_clear_mmio_info(vcpu, ~0ul);
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2667,6 +2883,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+
+/*
+ * On direct hosts, the last spte is only allows two states
+ * for mmio page fault:
+ * - It is the mmio spte
+ * - It is zapped or it is being zapped.
+ *
+ * This function completely checks the spte when the last spte
+ * is not the mmio spte.
+ */
+static bool check_direct_spte_mmio_pf(u64 spte)
+{
+ return __check_direct_spte_mmio_pf(spte);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte = 0ull;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+ if (!is_shadow_present_pte(spte))
+ break;
+ walk_shadow_page_lockless_end(vcpu);
+
+ return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulat the instruction
+ * directly, return 0 to let CPU fault again on the address, -1 is
+ * returned if bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ u64 spte;
+
+ if (quickly_check_mmio_pf(vcpu, addr, direct))
+ return 1;
+
+ spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+
+ if (is_mmio_spte(spte)) {
+ gfn_t gfn = get_mmio_spte_gfn(spte);
+ unsigned access = get_mmio_spte_access(spte);
+
+ if (direct)
+ addr = 0;
+
+ trace_handle_mmio_page_fault(addr, gfn, access);
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return 1;
+ }
+
+ /*
+ * It's ok if the gva is remapped by other cpus on shadow guest,
+ * it's a BUG if the gfn is not a mmio page.
+ */
+ if (direct && !check_direct_spte_mmio_pf(spte))
+ return -1;
+
+ /*
+ * If the page table is zapped by other cpus, let CPU fault again on
+ * the address.
+ */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+ u32 error_code, bool direct)
+{
+ int ret;
+
+ ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+ WARN_ON(ret < 0);
+ return ret;
+}
+
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
{
@@ -2674,6 +2978,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
int r;
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -2750,6 +3058,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -2767,9 +3078,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
return 0;
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+ return r;
+
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
@@ -2800,7 +3111,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
@@ -2848,6 +3158,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
}
+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+ int *nr_present)
+{
+ if (unlikely(is_mmio_spte(*sptep))) {
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ mmu_spte_clear_no_track(sptep);
+ return true;
+ }
+
+ (*nr_present)++;
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
@@ -2930,7 +3257,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
- context->prefetch_page = paging64_prefetch_page;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
context->update_pte = paging64_update_pte;
@@ -2959,7 +3285,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
- context->prefetch_page = paging32_prefetch_page;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
context->update_pte = paging32_update_pte;
@@ -2984,7 +3309,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
@@ -3023,6 +3347,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
int r;
+ bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -3037,6 +3362,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
+ vcpu->arch.mmu.base_role.smep_andnot_wp
+ = smep && !is_write_protection(vcpu);
return r;
}
@@ -3141,31 +3468,9 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *spte)
-{
- u64 pte;
- struct kvm_mmu_page *child;
-
- pte = *spte;
- if (is_shadow_present_pte(pte)) {
- if (is_last_spte(pte, sp->role.level))
- drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
- else {
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, spte);
- }
- }
- __set_spte(spte, shadow_trap_nonpresent_pte);
- if (is_large_pte(pte))
- --vcpu->kvm->stat.lpages;
-}
-
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *spte,
- const void *new, unsigned long mmu_seq)
+ struct kvm_mmu_page *sp, u64 *spte,
+ const void *new)
{
if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3173,7 +3478,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
}
++vcpu->kvm->stat.mmu_pte_updated;
- vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
+ vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -3229,12 +3534,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
struct kvm_mmu_page *sp;
struct hlist_node *node;
LIST_HEAD(invalid_list);
- unsigned long mmu_seq;
u64 entry, gentry, *spte;
unsigned pte_size, page_offset, misaligned, quadrant, offset;
int level, npte, invlpg_counter, r, flooded = 0;
bool remote_flush, local_flush, zap_page;
+ /*
+ * If we don't have indirect shadow pages, it means no page is
+ * write-protected, so we can exit simply.
+ */
+ if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+ return;
+
zap_page = remote_flush = local_flush = false;
offset = offset_in_page(gpa);
@@ -3271,9 +3582,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
break;
}
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
spin_lock(&vcpu->kvm->mmu_lock);
if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
gentry = 0;
@@ -3341,12 +3649,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
spte = &sp->spt[page_offset / sizeof(*spte)];
while (npte--) {
entry = *spte;
- mmu_pte_write_zap_pte(vcpu, sp, spte);
+ mmu_page_zap_pte(vcpu->kvm, sp, spte);
if (gentry &&
!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
& mask.word))
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
- mmu_seq);
+ mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
if (!remote_flush && need_remote_flush(entry, *spte))
remote_flush = true;
++spte;
@@ -3386,9 +3693,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
@@ -3512,15 +3819,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
continue;
if (is_large_pte(pt[i])) {
- drop_spte(kvm, &pt[i],
- shadow_trap_nonpresent_pte);
+ drop_spte(kvm, &pt[i]);
--kvm->stat.lpages;
continue;
}
/* avoid RMW */
if (is_writable_pte(pt[i]))
- update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+ mmu_spte_update(&pt[i],
+ pt[i] & ~PT_WRITABLE_MASK);
}
}
kvm_flush_remote_tlbs(kvm);
@@ -3551,10 +3858,11 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
}
-static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
struct kvm *kvm;
struct kvm *kvm_freed = NULL;
+ int nr_to_scan = sc->nr_to_scan;
if (nr_to_scan == 0)
goto out;
@@ -3595,25 +3903,18 @@ static struct shrinker mmu_shrinker = {
static void mmu_destroy_caches(void)
{
- if (pte_chain_cache)
- kmem_cache_destroy(pte_chain_cache);
- if (rmap_desc_cache)
- kmem_cache_destroy(rmap_desc_cache);
+ if (pte_list_desc_cache)
+ kmem_cache_destroy(pte_list_desc_cache);
if (mmu_page_header_cache)
kmem_cache_destroy(mmu_page_header_cache);
}
int kvm_mmu_module_init(void)
{
- pte_chain_cache = kmem_cache_create("kvm_pte_chain",
- sizeof(struct kvm_pte_chain),
- 0, 0, NULL);
- if (!pte_chain_cache)
- goto nomem;
- rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
- sizeof(struct kvm_rmap_desc),
+ pte_list_desc_cache = kmem_cache_create("pte_list_desc",
+ sizeof(struct pte_list_desc),
0, 0, NULL);
- if (!rmap_desc_cache)
+ if (!pte_list_desc_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
@@ -3780,16 +4081,17 @@ out:
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
int nr_sptes = 0;
- spin_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry(vcpu, addr, iterator) {
- sptes[iterator.level-1] = *iterator.sptep;
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ sptes[iterator.level-1] = spte;
nr_sptes++;
- if (!is_shadow_present_pte(*iterator.sptep))
+ if (!is_shadow_present_pte(spte))
break;
}
- spin_unlock(&vcpu->kvm->mmu_lock);
+ walk_shadow_page_lockless_end(vcpu);
return nr_sptes;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca85d3e..e374db9af02 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,8 @@
#define PFERR_FETCH_MASK (1U << 4)
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
@@ -76,4 +78,27 @@ static inline int is_present_gpte(unsigned long pte)
return pte & PT_PRESENT_MASK;
}
+static inline int is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+static inline bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
+ bool write_fault, bool user_fault,
+ unsigned long pte)
+{
+ if (unlikely(write_fault && !is_writable_pte(pte)
+ && (user_fault || is_write_protection(vcpu))))
+ return false;
+
+ if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+ return false;
+
+ return true;
+}
#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 5f6223b8bcf..2460a265be2 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
"level = %d\n", sp, level);
return;
}
-
- if (*sptep == shadow_notrap_nonpresent_pte) {
- audit_printk(vcpu->kvm, "notrap spte in unsync "
- "sp: %p\n", sp);
- return;
- }
- }
-
- if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
- sp);
- return;
}
if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b60b4fdb3ed..eed67f34146 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_ARGS(sp)
);
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+ mark_mmio_spte,
+ TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
+ TP_ARGS(sptep, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(void *, sptep)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->sptep = sptep;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
+ __entry->access)
+);
+
+TRACE_EVENT(
+ handle_mmio_page_fault,
+ TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+ TP_ARGS(addr, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(u64, addr)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+ __entry->access)
+);
+
TRACE_EVENT(
kvm_mmu_audit,
TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c6397795d86..507e2b844cf 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -78,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}
-static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
- gfn_t table_gfn, unsigned index,
- pt_element_t orig_pte, pt_element_t new_pte)
+static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ pt_element_t __user *ptep_user, unsigned index,
+ pt_element_t orig_pte, pt_element_t new_pte)
{
+ int npages;
pt_element_t ret;
pt_element_t *table;
struct page *page;
- page = gfn_to_page(kvm, table_gfn);
+ npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
+ /* Check if the user is doing something meaningless. */
+ if (unlikely(npages != 1))
+ return -EFAULT;
table = kmap_atomic(page, KM_USER0);
ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -97,11 +101,15 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
return (ret != orig_pte);
}
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
+ bool last)
{
unsigned access;
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+ if (last && !is_dirty_gpte(gpte))
+ access &= ~ACC_WRITE_MASK;
+
#if PTTYPE == 64
if (vcpu->arch.mmu.nx)
access &= ~(gpte >> PT64_NX_SHIFT);
@@ -109,6 +117,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
return access;
}
+static bool FNAME(is_last_gpte)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ pt_element_t gpte)
+{
+ if (walker->level == PT_PAGE_TABLE_LEVEL)
+ return true;
+
+ if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
+ (PTTYPE == 64 || is_pse(vcpu)))
+ return true;
+
+ if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
+ (mmu->root_level == PT64_ROOT_LEVEL))
+ return true;
+
+ return false;
+}
+
/*
* Fetch a guest pte for a guest virtual address
*/
@@ -117,21 +143,21 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
gva_t addr, u32 access)
{
pt_element_t pte;
+ pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
unsigned index, pt_access, uninitialized_var(pte_access);
gpa_t pte_gpa;
- bool eperm, present, rsvd_fault;
- int offset, write_fault, user_fault, fetch_fault;
-
- write_fault = access & PFERR_WRITE_MASK;
- user_fault = access & PFERR_USER_MASK;
- fetch_fault = access & PFERR_FETCH_MASK;
+ bool eperm;
+ int offset;
+ const int write_fault = access & PFERR_WRITE_MASK;
+ const int user_fault = access & PFERR_USER_MASK;
+ const int fetch_fault = access & PFERR_FETCH_MASK;
+ u16 errcode = 0;
trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
fetch_fault);
-walk:
- present = true;
- eperm = rsvd_fault = false;
+retry_walk:
+ eperm = false;
walker->level = mmu->root_level;
pte = mmu->get_cr3(vcpu);
@@ -139,10 +165,8 @@ walk:
if (walker->level == PT32E_ROOT_LEVEL) {
pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte)) {
- present = false;
+ if (!is_present_gpte(pte))
goto error;
- }
--walker->level;
}
#endif
@@ -152,6 +176,9 @@ walk:
pt_access = ACC_ALL;
for (;;) {
+ gfn_t real_gfn;
+ unsigned long host_addr;
+
index = PT_INDEX(addr, walker->level);
table_gfn = gpte_to_gfn(pte);
@@ -160,63 +187,69 @@ walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
- offset, sizeof(pte),
- PFERR_USER_MASK|PFERR_WRITE_MASK)) {
- present = false;
- break;
- }
+ real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
+ PFERR_USER_MASK|PFERR_WRITE_MASK);
+ if (unlikely(real_gfn == UNMAPPED_GVA))
+ goto error;
+ real_gfn = gpa_to_gfn(real_gfn);
+
+ host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
+ if (unlikely(kvm_is_error_hva(host_addr)))
+ goto error;
+
+ ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
+ if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
+ goto error;
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte)) {
- present = false;
- break;
- }
+ if (unlikely(!is_present_gpte(pte)))
+ goto error;
- if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
- rsvd_fault = true;
- break;
+ if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
+ walker->level))) {
+ errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ goto error;
}
- if (write_fault && !is_writable_pte(pte))
- if (user_fault || is_write_protection(vcpu))
- eperm = true;
-
- if (user_fault && !(pte & PT_USER_MASK))
+ if (!check_write_user_access(vcpu, write_fault, user_fault,
+ pte))
eperm = true;
#if PTTYPE == 64
- if (fetch_fault && (pte & PT64_NX_MASK))
+ if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
eperm = true;
#endif
- if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
+ if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
+ int ret;
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
sizeof(pte));
- if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
- index, pte, pte|PT_ACCESSED_MASK))
- goto walk;
+ ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
+ pte, pte|PT_ACCESSED_MASK);
+ if (unlikely(ret < 0))
+ goto error;
+ else if (ret)
+ goto retry_walk;
+
mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_ACCESSED_MASK;
}
- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
walker->ptes[walker->level - 1] = pte;
- if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
- ((walker->level == PT_DIRECTORY_LEVEL) &&
- is_large_pte(pte) &&
- (PTTYPE == 64 || is_pse(vcpu))) ||
- ((walker->level == PT_PDPE_LEVEL) &&
- is_large_pte(pte) &&
- mmu->root_level == PT64_ROOT_LEVEL)) {
+ if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
int lvl = walker->level;
gpa_t real_gpa;
gfn_t gfn;
u32 ac;
+ /* check if the kernel is fetching from user page */
+ if (unlikely(pte_access & PT_USER_MASK) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ if (fetch_fault && !user_fault)
+ eperm = true;
+
gfn = gpte_to_gfn_lvl(pte, lvl);
gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
@@ -237,26 +270,32 @@ walk:
break;
}
- pt_access = pte_access;
+ pt_access &= FNAME(gpte_access)(vcpu, pte, false);
--walker->level;
}
- if (!present || eperm || rsvd_fault)
+ if (unlikely(eperm)) {
+ errcode |= PFERR_PRESENT_MASK;
goto error;
+ }
- if (write_fault && !is_dirty_gpte(pte)) {
- bool ret;
+ if (write_fault && unlikely(!is_dirty_gpte(pte))) {
+ int ret;
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
- pte|PT_DIRTY_MASK);
- if (ret)
- goto walk;
+ ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
+ pte, pte|PT_DIRTY_MASK);
+ if (unlikely(ret < 0))
+ goto error;
+ else if (ret)
+ goto retry_walk;
+
mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_DIRTY_MASK;
walker->ptes[walker->level - 1] = pte;
}
+ pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -264,19 +303,14 @@ walk:
return 1;
error:
+ errcode |= write_fault | user_fault;
+ if (fetch_fault && (mmu->nx ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+ errcode |= PFERR_FETCH_MASK;
+
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
- walker->fault.error_code = 0;
- if (present)
- walker->fault.error_code |= PFERR_PRESENT_MASK;
-
- walker->fault.error_code |= write_fault | user_fault;
-
- if (fetch_fault && mmu->nx)
- walker->fault.error_code |= PFERR_FETCH_MASK;
- if (rsvd_fault)
- walker->fault.error_code |= PFERR_RSVD_MASK;
-
+ walker->fault.error_code = errcode;
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
@@ -303,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
pt_element_t gpte)
{
- u64 nonpresent = shadow_trap_nonpresent_pte;
-
if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
- if (!is_present_gpte(gpte)) {
- if (!sp->unsync)
- nonpresent = shadow_notrap_nonpresent_pte;
+ if (!is_present_gpte(gpte))
goto no_present;
- }
if (!(gpte & PT_ACCESSED_MASK))
goto no_present;
@@ -320,12 +349,12 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
return false;
no_present:
- drop_spte(vcpu->kvm, spte, nonpresent);
+ drop_spte(vcpu->kvm, spte);
return true;
}
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte, unsigned long mmu_seq)
+ u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
@@ -336,21 +365,19 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (is_error_pfn(pfn)) {
+ if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return;
}
- if (mmu_notifier_retry(vcpu, mmu_seq))
- return;
/*
* we call mmu_set_spte() with host_writable = true because that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
+ NULL, PT_PAGE_TABLE_LEVEL,
gpte_to_gfn(gpte), pfn, true, true);
}
@@ -401,12 +428,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
unsigned pte_access;
gfn_t gfn;
pfn_t pfn;
- bool dirty;
if (spte == sptep)
continue;
- if (*spte != shadow_trap_nonpresent_pte)
+ if (is_shadow_present_pte(*spte))
continue;
gpte = gptep[i];
@@ -414,18 +440,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+ true);
gfn = gpte_to_gfn(gpte);
- dirty = is_dirty_gpte(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
- (pte_access & ACC_WRITE_MASK) && dirty);
- if (is_error_pfn(pfn)) {
+ pte_access & ACC_WRITE_MASK);
+ if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
break;
}
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+ NULL, PT_PAGE_TABLE_LEVEL, gfn,
pfn, true, true);
}
}
@@ -436,12 +462,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn, bool map_writable,
+ int *emulate, pfn_t pfn, bool map_writable,
bool prefault)
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
- bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
int top_level;
unsigned direct_access;
struct kvm_shadow_walk_iterator it;
@@ -449,9 +474,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
- direct_access = gw->pt_access & gw->pte_access;
- if (!dirty)
- direct_access &= ~ACC_WRITE_MASK;
+ direct_access = gw->pte_access;
top_level = vcpu->arch.mmu.root_level;
if (top_level == PT32E_ROOT_LEVEL)
@@ -509,8 +532,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(it.sptep, sp);
}
- mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
- user_fault, write_fault, dirty, ptwrite, it.level,
+ mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
+ user_fault, write_fault, emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
@@ -544,7 +567,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
u64 *sptep;
- int write_pt = 0;
+ int emulate = 0;
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
@@ -554,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, addr, error_code,
+ mmu_is_nested(vcpu));
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -592,9 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
&map_writable))
return 0;
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
+ walker.gfn, pfn, walker.pte_access, &r))
+ return r;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -605,19 +632,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn, map_writable, prefault);
+ level, &emulate, pfn, map_writable, prefault);
(void)sptep;
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
- sptep, *sptep, write_pt);
+ pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
+ sptep, *sptep, emulate);
- if (!write_pt)
+ if (!emulate)
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
++vcpu->stat.pf_fixed;
trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);
- return write_pt;
+ return emulate;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -634,6 +661,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
u64 *sptep;
int need_flush = 0;
+ vcpu_clear_mmio_info(vcpu, gva);
+
spin_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry(vcpu, gva, iterator) {
@@ -657,11 +686,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (is_shadow_present_pte(*sptep)) {
if (is_large_pte(*sptep))
--vcpu->kvm->stat.lpages;
- drop_spte(vcpu->kvm, sptep,
- shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
need_flush = 1;
- } else
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ } else if (is_mmio_spte(*sptep))
+ mmu_spte_clear_no_track(sptep);
+
break;
}
@@ -721,36 +750,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
return gpa;
}
-static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i, j, offset, r;
- pt_element_t pt[256 / sizeof(pt_element_t)];
- gpa_t pte_gpa;
-
- if (sp->role.direct
- || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
- nonpaging_prefetch_page(vcpu, sp);
- return;
- }
-
- pte_gpa = gfn_to_gpa(sp->gfn);
- if (PTTYPE == 32) {
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
- pte_gpa += offset * sizeof(pt_element_t);
- }
-
- for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
- pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
- for (j = 0; j < ARRAY_SIZE(pt); ++j)
- if (r || is_present_gpte(pt[j]))
- sp->spt[i+j] = shadow_trap_nonpresent_pte;
- else
- sp->spt[i+j] = shadow_notrap_nonpresent_pte;
- }
-}
-
/*
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -786,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gpa_t pte_gpa;
gfn_t gfn;
- if (!is_shadow_present_pte(sp->spt[i]))
+ if (!sp->spt[i])
continue;
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -795,26 +794,30 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- gfn = gpte_to_gfn(gpte);
-
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+
+ if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+ continue;
+
if (gfn != sp->gfns[i]) {
- drop_spte(vcpu->kvm, &sp->spt[i],
- shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, &sp->spt[i]);
vcpu->kvm->tlbs_dirty++;
continue;
}
nr_present++;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
+ PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
host_writable);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6bb15d583e4..475d1c94850 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -63,6 +63,10 @@ MODULE_LICENSE("GPL");
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+#define TSC_RATIO_RSVD 0xffffff0000000000ULL
+#define TSC_RATIO_MIN 0x0000000000000001ULL
+#define TSC_RATIO_MAX 0x000000ffffffffffULL
+
static bool erratum_383_found __read_mostly;
static const u32 host_save_user_msrs[] = {
@@ -93,14 +97,6 @@ struct nested_state {
/* A VMEXIT is required but not yet emulated */
bool exit_required;
- /*
- * If we vmexit during an instruction emulation we need this to restore
- * the l1 guest rip after the emulation
- */
- unsigned long vmexit_rip;
- unsigned long vmexit_rsp;
- unsigned long vmexit_rax;
-
/* cache for intercepts of the guest */
u32 intercept_cr;
u32 intercept_dr;
@@ -144,8 +140,13 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
u32 apf_reason;
+
+ u64 tsc_ratio;
};
+static DEFINE_PER_CPU(u64, current_tsc_ratio);
+#define TSC_RATIO_DEFAULT 0x0100000000ULL
+
#define MSR_INVALID 0xffffffffU
static struct svm_direct_access_msrs {
@@ -190,6 +191,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
+static u64 __scale_tsc(u64 ratio, u64 tsc);
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -376,7 +378,6 @@ struct svm_cpu_data {
};
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-static uint32_t svm_features;
struct svm_init_data {
int cpu;
@@ -569,6 +570,10 @@ static int has_svm(void)
static void svm_hardware_disable(void *garbage)
{
+ /* Make sure we clean up behind us */
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+
cpu_svm_disable();
}
@@ -610,6 +615,11 @@ static int svm_hardware_enable(void *garbage)
wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+ __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
+ }
+
svm_init_erratum_383();
return 0;
@@ -791,6 +801,23 @@ static __init int svm_hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
+ if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ u64 max;
+
+ kvm_has_tsc_control = true;
+
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated needed because it will always
+ * be 1 on all machines and a value of 0 is used to disable
+ * tsc-scaling for the vcpu.
+ */
+ max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
+
+ kvm_max_guest_tsc_khz = max;
+ }
+
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -802,8 +829,6 @@ static __init int svm_hardware_setup(void)
goto err;
}
- svm_features = cpuid_edx(SVM_CPUID_FUNC);
-
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
@@ -854,6 +879,64 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
+static u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ u64 mult, frac, _tsc;
+
+ mult = ratio >> 32;
+ frac = ratio & ((1ULL << 32) - 1);
+
+ _tsc = tsc;
+ _tsc *= mult;
+ _tsc += (tsc >> 32) * frac;
+ _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
+
+ return _tsc;
+}
+
+static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 _tsc = tsc;
+
+ if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
+ _tsc = __scale_tsc(svm->tsc_ratio, tsc);
+
+ return _tsc;
+}
+
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 ratio;
+ u64 khz;
+
+ /* TSC scaling supported? */
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+ return;
+
+ /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
+ if (user_tsc_khz == 0) {
+ vcpu->arch.virtual_tsc_khz = 0;
+ svm->tsc_ratio = TSC_RATIO_DEFAULT;
+ return;
+ }
+
+ khz = user_tsc_khz;
+
+ /* TSC scaling required - calculate ratio */
+ ratio = khz << 32;
+ do_div(ratio, tsc_khz);
+
+ if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
+ WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return;
+ }
+ vcpu->arch.virtual_tsc_khz = user_tsc_khz;
+ svm->tsc_ratio = ratio;
+}
+
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -880,6 +963,15 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
+static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = svm_scale_tsc(vcpu, native_read_tsc());
+
+ return target_tsc - tsc;
+}
+
static void init_vmcb(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
@@ -975,7 +1067,7 @@ static void init_vmcb(struct vcpu_svm *svm)
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
save->dr7 = 0x400;
- save->rflags = 2;
+ kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1048,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto out;
}
+ svm->tsc_ratio = TSC_RATIO_DEFAULT;
+
err = kvm_vcpu_init(&svm->vcpu, kvm, id);
if (err)
goto free_svm;
@@ -1141,6 +1235,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
+ svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
+ __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
+ wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
+ }
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1365,31 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_guest_mode(vcpu)) {
- /*
- * We are here because we run in nested mode, the host kvm
- * intercepts cr0 writes but the l1 hypervisor does not.
- * But the L1 hypervisor may intercept selective cr0 writes.
- * This needs to be checked here.
- */
- unsigned long old, new;
-
- /* Remove bits that would trigger a real cr0 write intercept */
- old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
- new = cr0 & SVM_CR0_SELECTIVE_MASK;
-
- if (old == new) {
- /* cr0 write with ts and mp unchanged */
- svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
- if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
- svm->nested.vmexit_rip = kvm_rip_read(vcpu);
- svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- return;
- }
- }
- }
-
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1421,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
update_cr0_intercept(svm);
}
-static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
+ if (cr4 & X86_CR4_VMXE)
+ return 1;
+
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
svm_flush_tlb(vcpu);
@@ -1435,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+ return 0;
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -2127,7 +2206,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
nested_vmcb->save.cr2 = vmcb->save.cr2;
nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
- nested_vmcb->save.rflags = vmcb->save.rflags;
+ nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
nested_vmcb->save.rip = vmcb->save.rip;
nested_vmcb->save.rsp = vmcb->save.rsp;
nested_vmcb->save.rax = vmcb->save.rax;
@@ -2184,7 +2263,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.ds = hsave->save.ds;
svm->vmcb->save.gdtr = hsave->save.gdtr;
svm->vmcb->save.idtr = hsave->save.idtr;
- svm->vmcb->save.rflags = hsave->save.rflags;
+ kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
svm_set_efer(&svm->vcpu, hsave->save.efer);
svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
svm_set_cr4(&svm->vcpu, hsave->save.cr4);
@@ -2312,7 +2391,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
hsave->save.efer = svm->vcpu.arch.efer;
hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = vmcb->save.rflags;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
hsave->save.rip = kvm_rip_read(&svm->vcpu);
hsave->save.rsp = vmcb->save.rsp;
hsave->save.rax = vmcb->save.rax;
@@ -2323,7 +2402,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
copy_vmcb_control_area(hsave, vmcb);
- if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
+ if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
else
svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
@@ -2341,7 +2420,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.ds = nested_vmcb->save.ds;
svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
svm->vmcb->save.idtr = nested_vmcb->save.idtr;
- svm->vmcb->save.rflags = nested_vmcb->save.rflags;
+ kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
@@ -2443,13 +2522,13 @@ static int vmload_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
nested_svm_unmap(page);
@@ -2464,13 +2543,13 @@ static int vmsave_interception(struct vcpu_svm *svm)
if (nested_svm_check_permissions(svm))
return 1;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return 1;
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
nested_svm_unmap(page);
@@ -2676,6 +2755,29 @@ static int emulate_on_interception(struct vcpu_svm *svm)
return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
}
+bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
+{
+ unsigned long cr0 = svm->vcpu.arch.cr0;
+ bool ret = false;
+ u64 intercept;
+
+ intercept = svm->nested.intercept;
+
+ if (!is_guest_mode(&svm->vcpu) ||
+ (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
+ return false;
+
+ cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+ val &= ~SVM_CR0_SELECTIVE_MASK;
+
+ if (cr0 ^ val) {
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
+ }
+
+ return ret;
+}
+
#define CR_VALID (1ULL << 63)
static int cr_interception(struct vcpu_svm *svm)
@@ -2699,7 +2801,11 @@ static int cr_interception(struct vcpu_svm *svm)
val = kvm_register_read(&svm->vcpu, reg);
switch (cr) {
case 0:
- err = kvm_set_cr0(&svm->vcpu, val);
+ if (!check_selective_cr0_intercepted(svm, val))
+ err = kvm_set_cr0(&svm->vcpu, val);
+ else
+ return 1;
+
break;
case 3:
err = kvm_set_cr3(&svm->vcpu, val);
@@ -2744,23 +2850,6 @@ static int cr_interception(struct vcpu_svm *svm)
return 1;
}
-static int cr0_write_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- int r;
-
- r = cr_interception(svm);
-
- if (svm->nested.vmexit_rip) {
- kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
- kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
- kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
- svm->nested.vmexit_rip = 0;
- }
-
- return r;
-}
-
static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
@@ -2813,7 +2902,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
case MSR_IA32_TSC: {
struct vmcb *vmcb = get_host_vmcb(svm);
- *data = vmcb->control.tsc_offset + native_read_tsc();
+ *data = vmcb->control.tsc_offset +
+ svm_scale_tsc(vcpu, native_read_tsc());
+
break;
}
case MSR_STAR:
@@ -3048,7 +3139,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_READ_CR4] = cr_interception,
[SVM_EXIT_READ_CR8] = cr_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
+ [SVM_EXIT_WRITE_CR0] = cr_interception,
[SVM_EXIT_WRITE_CR3] = cr_interception,
[SVM_EXIT_WRITE_CR4] = cr_interception,
[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
@@ -3104,97 +3195,109 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_NPF] = pf_interception,
};
-void dump_vmcb(struct kvm_vcpu *vcpu)
+static void dump_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
pr_err("VMCB Control Area:\n");
- pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
- pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
- pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
- pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
- pr_err("exceptions: %08x\n", control->intercept_exceptions);
- pr_err("intercepts: %016llx\n", control->intercept);
- pr_err("pause filter count: %d\n", control->pause_filter_count);
- pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
- pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
- pr_err("tsc_offset: %016llx\n", control->tsc_offset);
- pr_err("asid: %d\n", control->asid);
- pr_err("tlb_ctl: %d\n", control->tlb_ctl);
- pr_err("int_ctl: %08x\n", control->int_ctl);
- pr_err("int_vector: %08x\n", control->int_vector);
- pr_err("int_state: %08x\n", control->int_state);
- pr_err("exit_code: %08x\n", control->exit_code);
- pr_err("exit_info1: %016llx\n", control->exit_info_1);
- pr_err("exit_info2: %016llx\n", control->exit_info_2);
- pr_err("exit_int_info: %08x\n", control->exit_int_info);
- pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
- pr_err("nested_ctl: %lld\n", control->nested_ctl);
- pr_err("nested_cr3: %016llx\n", control->nested_cr3);
- pr_err("event_inj: %08x\n", control->event_inj);
- pr_err("event_inj_err: %08x\n", control->event_inj_err);
- pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
- pr_err("next_rip: %016llx\n", control->next_rip);
+ pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
+ pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
+ pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
+ pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
+ pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
+ pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
+ pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
+ pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
+ pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
+ pr_err("%-20s%d\n", "asid:", control->asid);
+ pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+ pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
+ pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
+ pr_err("%-20s%08x\n", "int_state:", control->int_state);
+ pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+ pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
+ pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
+ pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
+ pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
+ pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+ pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
+ pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
+ pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
+ pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
pr_err("VMCB State Save Area:\n");
- pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
- save->es.selector, save->es.attrib,
- save->es.limit, save->es.base);
- pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
- save->cs.selector, save->cs.attrib,
- save->cs.limit, save->cs.base);
- pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
- save->ss.selector, save->ss.attrib,
- save->ss.limit, save->ss.base);
- pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
- save->ds.selector, save->ds.attrib,
- save->ds.limit, save->ds.base);
- pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
- save->fs.selector, save->fs.attrib,
- save->fs.limit, save->fs.base);
- pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
- save->gs.selector, save->gs.attrib,
- save->gs.limit, save->gs.base);
- pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->gdtr.selector, save->gdtr.attrib,
- save->gdtr.limit, save->gdtr.base);
- pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->ldtr.selector, save->ldtr.attrib,
- save->ldtr.limit, save->ldtr.base);
- pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->idtr.selector, save->idtr.attrib,
- save->idtr.limit, save->idtr.base);
- pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
- save->tr.selector, save->tr.attrib,
- save->tr.limit, save->tr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "es:",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "cs:",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ss:",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ds:",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "fs:",
+ save->fs.selector, save->fs.attrib,
+ save->fs.limit, save->fs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gs:",
+ save->gs.selector, save->gs.attrib,
+ save->gs.limit, save->gs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gdtr:",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ldtr:",
+ save->ldtr.selector, save->ldtr.attrib,
+ save->ldtr.limit, save->ldtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "idtr:",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "tr:",
+ save->tr.selector, save->tr.attrib,
+ save->tr.limit, save->tr.base);
pr_err("cpl: %d efer: %016llx\n",
save->cpl, save->efer);
- pr_err("cr0: %016llx cr2: %016llx\n",
- save->cr0, save->cr2);
- pr_err("cr3: %016llx cr4: %016llx\n",
- save->cr3, save->cr4);
- pr_err("dr6: %016llx dr7: %016llx\n",
- save->dr6, save->dr7);
- pr_err("rip: %016llx rflags: %016llx\n",
- save->rip, save->rflags);
- pr_err("rsp: %016llx rax: %016llx\n",
- save->rsp, save->rax);
- pr_err("star: %016llx lstar: %016llx\n",
- save->star, save->lstar);
- pr_err("cstar: %016llx sfmask: %016llx\n",
- save->cstar, save->sfmask);
- pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
- save->kernel_gs_base, save->sysenter_cs);
- pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
- save->sysenter_esp, save->sysenter_eip);
- pr_err("gpat: %016llx dbgctl: %016llx\n",
- save->g_pat, save->dbgctl);
- pr_err("br_from: %016llx br_to: %016llx\n",
- save->br_from, save->br_to);
- pr_err("excp_from: %016llx excp_to: %016llx\n",
- save->last_excp_from, save->last_excp_to);
-
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr0:", save->cr0, "cr2:", save->cr2);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr3:", save->cr3, "cr4:", save->cr4);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "dr6:", save->dr6, "dr7:", save->dr7);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rip:", save->rip, "rflags:", save->rflags);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsp:", save->rsp, "rax:", save->rax);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "star:", save->star, "lstar:", save->lstar);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cstar:", save->cstar, "sfmask:", save->sfmask);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "kernel_gs_base:", save->kernel_gs_base,
+ "sysenter_cs:", save->sysenter_cs);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "sysenter_esp:", save->sysenter_esp,
+ "sysenter_eip:", save->sysenter_eip);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "br_from:", save->br_from, "br_to:", save->br_to);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "excp_from:", save->last_excp_from,
+ "excp_to:", save->last_excp_to);
}
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
@@ -3384,7 +3487,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
return 0;
- ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
+ ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
if (is_guest_mode(vcpu))
return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
@@ -3871,6 +3974,186 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
update_cr0_intercept(svm);
}
+#define PRE_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_PRE_EXCEPT, }
+#define POST_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_EXCEPT, }
+#define POST_MEM(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_MEMACCESS, }
+
+static struct __x86_intercept {
+ u32 exit_code;
+ enum x86_intercept_stage stage;
+} x86_intercept_map[] = {
+ [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
+ [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
+ [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
+ [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
+ [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
+ [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
+ [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
+ [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
+ [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
+ [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
+ [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
+ [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
+ [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
+ [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
+ [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
+ [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
+ [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
+ [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
+ [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
+ [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
+ [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
+ [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
+ [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
+ [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
+ [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
+ [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
+ [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
+ [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
+ [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
+ [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
+ [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
+ [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
+ [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
+ [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
+ [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
+ [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+};
+
+#undef PRE_EX
+#undef POST_EX
+#undef POST_MEM
+
+static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int vmexit, ret = X86EMUL_CONTINUE;
+ struct __x86_intercept icpt_info;
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
+ goto out;
+
+ icpt_info = x86_intercept_map[info->intercept];
+
+ if (stage != icpt_info.stage)
+ goto out;
+
+ switch (icpt_info.exit_code) {
+ case SVM_EXIT_READ_CR0:
+ if (info->intercept == x86_intercept_cr_read)
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_WRITE_CR0: {
+ unsigned long cr0, val;
+ u64 intercept;
+
+ if (info->intercept == x86_intercept_cr_write)
+ icpt_info.exit_code += info->modrm_reg;
+
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
+ break;
+
+ intercept = svm->nested.intercept;
+
+ if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
+ break;
+
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+
+ if (info->intercept == x86_intercept_lmsw) {
+ cr0 &= 0xfUL;
+ val &= 0xfUL;
+ /* lmsw can't clear PE - catch this here */
+ if (cr0 & X86_CR0_PE)
+ val |= X86_CR0_PE;
+ }
+
+ if (cr0 ^ val)
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+
+ break;
+ }
+ case SVM_EXIT_READ_DR0:
+ case SVM_EXIT_WRITE_DR0:
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_MSR:
+ if (info->intercept == x86_intercept_wrmsr)
+ vmcb->control.exit_info_1 = 1;
+ else
+ vmcb->control.exit_info_1 = 0;
+ break;
+ case SVM_EXIT_PAUSE:
+ /*
+ * We get this for NOP only, but pause
+ * is rep not, check this here
+ */
+ if (info->rep_prefix != REPE_PREFIX)
+ goto out;
+ case SVM_EXIT_IOIO: {
+ u64 exit_info;
+ u32 bytes;
+
+ exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ exit_info |= SVM_IOIO_TYPE_MASK;
+ bytes = info->src_bytes;
+ } else {
+ bytes = info->dst_bytes;
+ }
+
+ if (info->intercept == x86_intercept_outs ||
+ info->intercept == x86_intercept_ins)
+ exit_info |= SVM_IOIO_STR_MASK;
+
+ if (info->rep_prefix)
+ exit_info |= SVM_IOIO_REP_MASK;
+
+ bytes = min(bytes, 4u);
+
+ exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
+
+ exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
+
+ vmcb->control.exit_info_1 = exit_info;
+ vmcb->control.exit_info_2 = info->next_rip;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ vmcb->control.next_rip = info->next_rip;
+ vmcb->control.exit_code = icpt_info.exit_code;
+ vmexit = nested_svm_exit_handled(svm);
+
+ ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
+ : X86EMUL_CONTINUE;
+
+out:
+ return ret;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -3952,10 +4235,14 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
+ .set_tsc_khz = svm_set_tsc_khz,
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset = svm_adjust_tsc_offset,
+ .compute_tsc_offset = svm_compute_tsc_offset,
.set_tdp_cr3 = set_tdp_cr3,
+
+ .check_intercept = svm_check_intercept,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index abd86e865be..ae432ea1cd8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -15,7 +15,7 @@
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/hrtimer.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include "kvm_timer.h"
static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index db932760ea8..3ff898c104f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+ __entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
- __entry->len = vcpu->arch.emulate_ctxt.decode.eip
- - vcpu->arch.emulate_ctxt.decode.fetch.start;
+ __entry->len = vcpu->arch.emulate_ctxt._eip
+ - vcpu->arch.emulate_ctxt.fetch.start;
memcpy(__entry->insn,
- vcpu->arch.emulate_ctxt.decode.fetch.data,
+ vcpu->arch.emulate_ctxt.fetch.data,
15);
__entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
__entry->failed = failed;
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn,
#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+TRACE_EVENT(
+ vcpu_match_mmio,
+ TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+ TP_ARGS(gva, gpa, write, gpa_match),
+
+ TP_STRUCT__entry(
+ __field(gva_t, gva)
+ __field(gpa_t, gpa)
+ __field(bool, write)
+ __field(bool, gpa_match)
+ ),
+
+ TP_fast_assign(
+ __entry->gva = gva;
+ __entry->gpa = gpa;
+ __entry->write = write;
+ __entry->gpa_match = gpa_match
+ ),
+
+ TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+ __entry->write ? "Write" : "Read",
+ __entry->gpa_match ? "GPA" : "GVA")
+);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b4cdcbd154..e65a158dee6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -43,13 +43,12 @@
#include "trace.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#define __ex_clear(x, reg) \
+ ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
-static int __read_mostly bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, S_IRUGO);
-
static int __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
static int __read_mostly yield_on_hlt = 1;
module_param(yield_on_hlt, bool, S_IRUGO);
+/*
+ * If nested=1, nested virtualization is supported, i.e., guests may use
+ * VMX and be a hypervisor for its own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static int __read_mostly nested = 0;
+module_param(nested, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
#define NR_AUTOLOAD_MSRS 1
+#define VMCS02_POOL_SIZE 1
struct vmcs {
u32 revision_id;
@@ -116,20 +124,243 @@ struct vmcs {
char data[0];
};
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+ struct vmcs *vmcs;
+ int cpu;
+ int launched;
+ struct list_head loaded_vmcss_on_cpu_link;
+};
+
struct shared_msr_entry {
unsigned index;
u64 data;
u64 mask;
};
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ * If there are changes in this struct, VMCS12_REVISION must be changed.
+ */
+typedef u64 natural_width;
+struct __packed vmcs12 {
+ /* According to the Intel spec, a VMCS region must start with the
+ * following two fields. Then follow implementation-specific data.
+ */
+ u32 revision_id;
+ u32 abort;
+
+ u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+ u32 padding[7]; /* room for future expansion */
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 apic_access_addr;
+ u64 ept_pointer;
+ u64 guest_physical_address;
+ u64 vmcs_link_pointer;
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+ u64 guest_ia32_perf_global_ctrl;
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+ u64 host_ia32_perf_global_ctrl;
+ u64 padding64[8]; /* room for future expansion */
+ /*
+ * To allow migration of L1 (complete with its L2 guests) between
+ * machines of different natural widths (32 or 64 bit), we cannot have
+ * unsigned long fields with no explict size. We use u64 (aliased
+ * natural_width) instead. Luckily, x86 is little-endian.
+ */
+ natural_width cr0_guest_host_mask;
+ natural_width cr4_guest_host_mask;
+ natural_width cr0_read_shadow;
+ natural_width cr4_read_shadow;
+ natural_width cr3_target_value0;
+ natural_width cr3_target_value1;
+ natural_width cr3_target_value2;
+ natural_width cr3_target_value3;
+ natural_width exit_qualification;
+ natural_width guest_linear_address;
+ natural_width guest_cr0;
+ natural_width guest_cr3;
+ natural_width guest_cr4;
+ natural_width guest_es_base;
+ natural_width guest_cs_base;
+ natural_width guest_ss_base;
+ natural_width guest_ds_base;
+ natural_width guest_fs_base;
+ natural_width guest_gs_base;
+ natural_width guest_ldtr_base;
+ natural_width guest_tr_base;
+ natural_width guest_gdtr_base;
+ natural_width guest_idtr_base;
+ natural_width guest_dr7;
+ natural_width guest_rsp;
+ natural_width guest_rip;
+ natural_width guest_rflags;
+ natural_width guest_pending_dbg_exceptions;
+ natural_width guest_sysenter_esp;
+ natural_width guest_sysenter_eip;
+ natural_width host_cr0;
+ natural_width host_cr3;
+ natural_width host_cr4;
+ natural_width host_fs_base;
+ natural_width host_gs_base;
+ natural_width host_tr_base;
+ natural_width host_gdtr_base;
+ natural_width host_idtr_base;
+ natural_width host_ia32_sysenter_esp;
+ natural_width host_ia32_sysenter_eip;
+ natural_width host_rsp;
+ natural_width host_rip;
+ natural_width paddingl[8]; /* room for future expansion */
+ u32 pin_based_vm_exec_control;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+ u32 cr3_target_count;
+ u32 vm_exit_controls;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_controls;
+ u32 vm_entry_msr_load_count;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+ u32 secondary_vm_exec_control;
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+ u32 guest_interruptibility_info;
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+ u32 host_ia32_sysenter_cs;
+ u32 padding32[8]; /* room for future expansion */
+ u16 virtual_processor_id;
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+};
+
+/*
+ * VMCS12_REVISION is an arbitrary id that should be changed if the content or
+ * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
+ * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
+ * current implementation, 4K are reserved to avoid future complications.
+ */
+#define VMCS12_SIZE 0x1000
+
+/* Used to remember the last vmcs02 used for some recently used vmcs12s */
+struct vmcs02_list {
+ struct list_head list;
+ gpa_t vmptr;
+ struct loaded_vmcs vmcs02;
+};
+
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+ /* Has the level1 guest done vmxon? */
+ bool vmxon;
+
+ /* The guest-physical address of the current VMCS L1 keeps for L2 */
+ gpa_t current_vmptr;
+ /* The host-usable pointer to the above */
+ struct page *current_vmcs12_page;
+ struct vmcs12 *current_vmcs12;
+
+ /* vmcs02_list cache of VMCSs recently used to run L2 guests */
+ struct list_head vmcs02_pool;
+ int vmcs02_num;
+ u64 vmcs01_tsc_offset;
+ /* L2 must run next, and mustn't decide to exit to L1. */
+ bool nested_run_pending;
+ /*
+ * Guest pages referred to in vmcs02 with host-physical pointers, so
+ * we must keep them pinned while L2 runs.
+ */
+ struct page *apic_access_page;
+};
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
- struct list_head local_vcpus_link;
unsigned long host_rsp;
- int launched;
u8 fail;
+ u8 cpl;
+ bool nmi_known_unmasked;
u32 exit_intr_info;
u32 idt_vectoring_info;
+ ulong rflags;
struct shared_msr_entry *guest_msrs;
int nmsrs;
int save_nmsrs;
@@ -137,7 +368,14 @@ struct vcpu_vmx {
u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
#endif
- struct vmcs *vmcs;
+ /*
+ * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+ * non-nested (L1) guest, it always points to vmcs01. For a nested
+ * guest (L2), it points to a different VMCS.
+ */
+ struct loaded_vmcs vmcs01;
+ struct loaded_vmcs *loaded_vmcs;
+ bool __launched; /* temporary, used in vmx_vcpu_run */
struct msr_autoload {
unsigned nr;
struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -159,6 +397,10 @@ struct vcpu_vmx {
u32 ar;
} tr, es, ds, fs, gs;
} rmode;
+ struct {
+ u32 bitmask; /* 4 bits per segment (1 bit per field) */
+ struct kvm_save_segment seg[8];
+ } segment_cache;
int vpid;
bool emulation_required;
@@ -169,6 +411,18 @@ struct vcpu_vmx {
u32 exit_reason;
bool rdtscp_enabled;
+
+ /* Support for a guest hypervisor (nested VMX) */
+ struct nested_vmx nested;
+};
+
+enum segment_cache_field {
+ SEG_FIELD_SEL = 0,
+ SEG_FIELD_BASE = 1,
+ SEG_FIELD_LIMIT = 2,
+ SEG_FIELD_AR = 3,
+
+ SEG_FIELD_NR = 4
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -176,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
+#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
+#define FIELD(number, name) [number] = VMCS12_OFFSET(name)
+#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
+ [number##_HIGH] = VMCS12_OFFSET(name)+4
+
+static unsigned short vmcs_field_to_offset_table[] = {
+ FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+ FIELD(GUEST_ES_SELECTOR, guest_es_selector),
+ FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
+ FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
+ FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
+ FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
+ FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
+ FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
+ FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+ FIELD(HOST_ES_SELECTOR, host_es_selector),
+ FIELD(HOST_CS_SELECTOR, host_cs_selector),
+ FIELD(HOST_SS_SELECTOR, host_ss_selector),
+ FIELD(HOST_DS_SELECTOR, host_ds_selector),
+ FIELD(HOST_FS_SELECTOR, host_fs_selector),
+ FIELD(HOST_GS_SELECTOR, host_gs_selector),
+ FIELD(HOST_TR_SELECTOR, host_tr_selector),
+ FIELD64(IO_BITMAP_A, io_bitmap_a),
+ FIELD64(IO_BITMAP_B, io_bitmap_b),
+ FIELD64(MSR_BITMAP, msr_bitmap),
+ FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
+ FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
+ FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+ FIELD64(TSC_OFFSET, tsc_offset),
+ FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
+ FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+ FIELD64(EPT_POINTER, ept_pointer),
+ FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
+ FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+ FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
+ FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
+ FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
+ FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
+ FIELD64(GUEST_PDPTR0, guest_pdptr0),
+ FIELD64(GUEST_PDPTR1, guest_pdptr1),
+ FIELD64(GUEST_PDPTR2, guest_pdptr2),
+ FIELD64(GUEST_PDPTR3, guest_pdptr3),
+ FIELD64(HOST_IA32_PAT, host_ia32_pat),
+ FIELD64(HOST_IA32_EFER, host_ia32_efer),
+ FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+ FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
+ FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
+ FIELD(EXCEPTION_BITMAP, exception_bitmap),
+ FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
+ FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
+ FIELD(CR3_TARGET_COUNT, cr3_target_count),
+ FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
+ FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
+ FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
+ FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
+ FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
+ FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
+ FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
+ FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
+ FIELD(TPR_THRESHOLD, tpr_threshold),
+ FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
+ FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
+ FIELD(VM_EXIT_REASON, vm_exit_reason),
+ FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
+ FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
+ FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
+ FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
+ FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
+ FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
+ FIELD(GUEST_ES_LIMIT, guest_es_limit),
+ FIELD(GUEST_CS_LIMIT, guest_cs_limit),
+ FIELD(GUEST_SS_LIMIT, guest_ss_limit),
+ FIELD(GUEST_DS_LIMIT, guest_ds_limit),
+ FIELD(GUEST_FS_LIMIT, guest_fs_limit),
+ FIELD(GUEST_GS_LIMIT, guest_gs_limit),
+ FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
+ FIELD(GUEST_TR_LIMIT, guest_tr_limit),
+ FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
+ FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
+ FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
+ FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
+ FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
+ FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
+ FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
+ FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
+ FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
+ FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
+ FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
+ FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
+ FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
+ FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+ FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
+ FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
+ FIELD(CR0_READ_SHADOW, cr0_read_shadow),
+ FIELD(CR4_READ_SHADOW, cr4_read_shadow),
+ FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
+ FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
+ FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
+ FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
+ FIELD(EXIT_QUALIFICATION, exit_qualification),
+ FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
+ FIELD(GUEST_CR0, guest_cr0),
+ FIELD(GUEST_CR3, guest_cr3),
+ FIELD(GUEST_CR4, guest_cr4),
+ FIELD(GUEST_ES_BASE, guest_es_base),
+ FIELD(GUEST_CS_BASE, guest_cs_base),
+ FIELD(GUEST_SS_BASE, guest_ss_base),
+ FIELD(GUEST_DS_BASE, guest_ds_base),
+ FIELD(GUEST_FS_BASE, guest_fs_base),
+ FIELD(GUEST_GS_BASE, guest_gs_base),
+ FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
+ FIELD(GUEST_TR_BASE, guest_tr_base),
+ FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
+ FIELD(GUEST_IDTR_BASE, guest_idtr_base),
+ FIELD(GUEST_DR7, guest_dr7),
+ FIELD(GUEST_RSP, guest_rsp),
+ FIELD(GUEST_RIP, guest_rip),
+ FIELD(GUEST_RFLAGS, guest_rflags),
+ FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
+ FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
+ FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
+ FIELD(HOST_CR0, host_cr0),
+ FIELD(HOST_CR3, host_cr3),
+ FIELD(HOST_CR4, host_cr4),
+ FIELD(HOST_FS_BASE, host_fs_base),
+ FIELD(HOST_GS_BASE, host_gs_base),
+ FIELD(HOST_TR_BASE, host_tr_base),
+ FIELD(HOST_GDTR_BASE, host_gdtr_base),
+ FIELD(HOST_IDTR_BASE, host_idtr_base),
+ FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
+ FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
+ FIELD(HOST_RSP, host_rsp),
+ FIELD(HOST_RIP, host_rip),
+};
+static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
+
+static inline short vmcs_field_to_offset(unsigned long field)
+{
+ if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
+ return -1;
+ return vmcs_field_to_offset_table[field];
+}
+
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.current_vmcs12;
+}
+
+static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ return NULL;
+ }
+ return page;
+}
+
+static void nested_release_page(struct page *page)
+{
+ kvm_release_page_dirty(page);
+}
+
+static void nested_release_page_clean(struct page *page)
+{
+ kvm_release_page_clean(page);
+}
+
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
@@ -184,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+/*
+ * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
static unsigned long *vmx_io_bitmap_a;
@@ -426,6 +852,35 @@ static inline bool report_flexpriority(void)
return flexpriority_enabled;
}
+static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+ return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
+static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+ return (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ (vmcs12->secondary_vm_exec_control & bit);
+}
+
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
+ struct kvm_vcpu *vcpu)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline bool is_exception(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 reason, unsigned long qualification);
+
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -485,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs)
vmcs, phys_addr);
}
+static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+ vmcs_clear(loaded_vmcs->vmcs);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+}
+
static void vmcs_load(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
@@ -494,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs)
: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
- printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+ printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
vmcs, phys_addr);
}
-static void __vcpu_clear(void *arg)
+static void __loaded_vmcs_clear(void *arg)
{
- struct vcpu_vmx *vmx = arg;
+ struct loaded_vmcs *loaded_vmcs = arg;
int cpu = raw_smp_processor_id();
- if (vmx->vcpu.cpu == cpu)
- vmcs_clear(vmx->vmcs);
- if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+ if (loaded_vmcs->cpu != cpu)
+ return; /* vcpu migration can race with cpu offline */
+ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- list_del(&vmx->local_vcpus_link);
- vmx->vcpu.cpu = -1;
- vmx->launched = 0;
+ list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+ loaded_vmcs_init(loaded_vmcs);
}
-static void vcpu_clear(struct vcpu_vmx *vmx)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
- if (vmx->vcpu.cpu == -1)
- return;
- smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
+ if (loaded_vmcs->cpu != -1)
+ smp_call_function_single(
+ loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
}
static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -569,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
}
}
-static unsigned long vmcs_readl(unsigned long field)
+static __always_inline unsigned long vmcs_readl(unsigned long field)
{
- unsigned long value = 0;
+ unsigned long value;
- asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
- : "+a"(value) : "d"(field) : "cc");
+ asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
+ : "=a"(value) : "d"(field) : "cc");
return value;
}
-static u16 vmcs_read16(unsigned long field)
+static __always_inline u16 vmcs_read16(unsigned long field)
{
return vmcs_readl(field);
}
-static u32 vmcs_read32(unsigned long field)
+static __always_inline u32 vmcs_read32(unsigned long field)
{
return vmcs_readl(field);
}
-static u64 vmcs_read64(unsigned long field)
+static __always_inline u64 vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
return vmcs_readl(field);
@@ -643,6 +1104,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
vmcs_writel(field, vmcs_readl(field) | mask);
}
+static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
+static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
+ unsigned field)
+{
+ bool ret;
+ u32 mask = 1 << (seg * SEG_FIELD_NR + field);
+
+ if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
+ vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
+ vmx->segment_cache.bitmask = 0;
+ }
+ ret = vmx->segment_cache.bitmask & mask;
+ vmx->segment_cache.bitmask |= mask;
+ return ret;
+}
+
+static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u16 *p = &vmx->segment_cache.seg[seg].selector;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
+ *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
+ return *p;
+}
+
+static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
+{
+ ulong *p = &vmx->segment_cache.seg[seg].base;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
+ *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].limit;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].ar;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
+ return *p;
+}
+
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
@@ -659,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
if (vcpu->fpu_active)
eb &= ~(1u << NM_VECTOR);
+
+ /* When we are running a nested L2 guest and L1 specified for it a
+ * certain exception bitmap, we must trap the same exceptions and pass
+ * them to L1. When running L2, we will only handle the exceptions
+ * specified above if L1 did not want them.
+ */
+ if (is_guest_mode(vcpu))
+ eb |= get_vmcs12(vcpu)->exception_bitmap;
+
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -899,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (!vmm_exclusive)
kvm_cpu_vmxon(phys_addr);
- else if (vcpu->cpu != cpu)
- vcpu_clear(vmx);
+ else if (vmx->loaded_vmcs->cpu != cpu)
+ loaded_vmcs_clear(vmx->loaded_vmcs);
- if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
- per_cpu(current_vmcs, cpu) = vmx->vmcs;
- vmcs_load(vmx->vmcs);
+ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
}
- if (vcpu->cpu != cpu) {
+ if (vmx->loaded_vmcs->cpu != cpu) {
struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
unsigned long sysenter_esp;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
local_irq_disable();
- list_add(&vmx->local_vcpus_link,
- &per_cpu(vcpus_on_cpu, cpu));
+ list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+ &per_cpu(loaded_vmcss_on_cpu, cpu));
local_irq_enable();
/*
@@ -926,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ vmx->loaded_vmcs->cpu = cpu;
}
}
@@ -933,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
__vmx_load_host_state(to_vmx(vcpu));
if (!vmm_exclusive) {
- __vcpu_clear(to_vmx(vcpu));
+ __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
+ vcpu->cpu = -1;
kvm_cpu_vmxoff();
}
}
@@ -951,36 +1479,79 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR0, cr0);
update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ if (is_guest_mode(vcpu))
+ vcpu->arch.cr0_guest_owned_bits &=
+ ~get_vmcs12(vcpu)->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
+/*
+ * Return the cr0 value that a nested guest would read. This is a combination
+ * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
+ * its hypervisor (cr0_read_shadow).
+ */
+static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+{
+ return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
+ (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+}
+static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+{
+ return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
+ (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+}
+
static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
+ /* Note that there is no vcpu->fpu_active = 0 here. The caller must
+ * set this *before* calling this function.
+ */
vmx_decache_cr0_guest_bits(vcpu);
vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits = 0;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+ if (is_guest_mode(vcpu)) {
+ /*
+ * L1's specified read shadow might not contain the TS bit,
+ * so now that we turned on shadowing of this bit, we need to
+ * set this bit of the shadow. Like in nested_vmx_run we need
+ * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
+ * up-to-date here because we just decached cr0.TS (and we'll
+ * only update vmcs12->guest_cr0 on nested exit).
+ */
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
+ (vcpu->arch.cr0 & X86_CR0_TS);
+ vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+ } else
+ vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
}
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;
- rflags = vmcs_readl(GUEST_RFLAGS);
- if (to_vmx(vcpu)->rmode.vm86_active) {
- rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
- save_rflags = to_vmx(vcpu)->rmode.save_rflags;
- rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
+ to_vmx(vcpu)->rflags = rflags;
}
- return rflags;
+ return to_vmx(vcpu)->rflags;
}
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ to_vmx(vcpu)->rflags = rflags;
if (to_vmx(vcpu)->rmode.vm86_active) {
to_vmx(vcpu)->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1040,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
+/*
+ * KVM wants to inject page-faults which it got to the guest. This function
+ * checks whether in a nested guest, we need to inject them to L1 or L2.
+ * This function assumes it is called with the exit reason in vmcs02 being
+ * a #PF exception (this is the only case in which KVM injects a #PF when L2
+ * is running).
+ */
+static int nested_pf_handled(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
+ if (!(vmcs12->exception_bitmap & PF_VECTOR))
+ return 0;
+
+ nested_vmx_vmexit(vcpu);
+ return 1;
+}
+
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
@@ -1047,13 +1637,20 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info = nr | INTR_INFO_VALID_MASK;
+ if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
+ nested_pf_handled(vcpu))
+ return;
+
if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
+ int inc_eip = 0;
+ if (kvm_exception_is_soft(nr))
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -1151,17 +1748,274 @@ static u64 guest_read_tsc(void)
}
/*
+ * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
+ * ioctl. In this case the call-back should update internal vmx state to make
+ * the changes effective.
+ */
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ /* Nothing to do here */
+}
+
+/*
* writes 'offset' into guest's timestamp counter offset register
*/
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
vmcs_write64(TSC_OFFSET, offset);
+ if (is_guest_mode(vcpu))
+ /*
+ * We're here if L1 chose not to trap the TSC MSR. Since
+ * prepare_vmcs12() does not copy tsc_offset, we need to also
+ * set the vmcs12 field here.
+ */
+ get_vmcs12(vcpu)->tsc_offset = offset -
+ to_vmx(vcpu)->nested.vmcs01_tsc_offset;
}
static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
{
u64 offset = vmcs_read64(TSC_OFFSET);
vmcs_write64(TSC_OFFSET, offset + adjustment);
+ if (is_guest_mode(vcpu)) {
+ /* Even when running L2, the adjustment needs to apply to L1 */
+ to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
+ }
+}
+
+static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ return target_tsc - native_read_tsc();
+}
+
+static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
+}
+
+/*
+ * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
+ * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
+ * all guests if the "nested" module option is off, and can also be disabled
+ * for a single guest by disabling its VMX cpuid bit.
+ */
+static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+{
+ return nested && guest_cpuid_has_vmx(vcpu);
+}
+
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control msrs has a low and high 32-bit half: A low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ * TODO: allow these variables to be modified (downgraded) by module options
+ * or other means.
+ */
+static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
+static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
+static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
+static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
+static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static __init void nested_vmx_setup_ctls_msrs(void)
+{
+ /*
+ * Note that as a general rule, the high half of the MSRs (bits in
+ * the control fields which may be 1) should be initialized by the
+ * intersection of the underlying hardware's MSR (i.e., features which
+ * can be supported) and the list of features we want to expose -
+ * because they are known to be properly supported in our code.
+ * Also, usually, the low half of the MSRs (bits which must be 1) can
+ * be set to 0, meaning that L1 may turn off any of these bits. The
+ * reason is that if one of these bits is necessary, it will appear
+ * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
+ * fields of vmcs01 and vmcs02, will turn these bits off - and
+ * nested_vmx_exit_handled() will not pass related exits to L1.
+ * These rules have exceptions below.
+ */
+
+ /* pin-based controls */
+ /*
+ * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
+ * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
+ */
+ nested_vmx_pinbased_ctls_low = 0x16 ;
+ nested_vmx_pinbased_ctls_high = 0x16 |
+ PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS;
+
+ /* exit controls */
+ nested_vmx_exit_ctls_low = 0;
+ /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+#ifdef CONFIG_X86_64
+ nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#else
+ nested_vmx_exit_ctls_high = 0;
+#endif
+
+ /* entry controls */
+ rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
+ nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
+ nested_vmx_entry_ctls_low = 0;
+ nested_vmx_entry_ctls_high &=
+ VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+
+ /* cpu-based controls */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
+ nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
+ nested_vmx_procbased_ctls_low = 0;
+ nested_vmx_procbased_ctls_high &=
+ CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+#ifdef CONFIG_X86_64
+ CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
+#endif
+ CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
+ CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ /*
+ * We can allow some features even when not supported by the
+ * hardware. For example, L1 can specify an MSR bitmap - and we
+ * can use it to avoid exits to L1 - even when L0 runs L2
+ * without MSR bitmaps.
+ */
+ nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
+
+ /* secondary cpu-based controls */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
+ nested_vmx_secondary_ctls_low = 0;
+ nested_vmx_secondary_ctls_high &=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+ /*
+ * Bits 0 in high must be 0, and bits 1 in low must be 1.
+ */
+ return ((control & high) | low) == control;
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+ return low | ((u64)high << 32);
+}
+
+/*
+ * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
+ * also let it use VMX-specific MSRs.
+ * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
+ * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
+ * like all other MSRs).
+ */
+static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+ if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
+ msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
+ /*
+ * According to the spec, processors which do not support VMX
+ * should throw a #GP(0) when VMX capability MSRs are read.
+ */
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+
+ switch (msr_index) {
+ case MSR_IA32_FEATURE_CONTROL:
+ *pdata = 0;
+ break;
+ case MSR_IA32_VMX_BASIC:
+ /*
+ * This MSR reports some information about VMX support. We
+ * should return information about the VMX we emulate for the
+ * guest, and the VMCS structure we give it - not about the
+ * VMX support of the underlying hardware.
+ */
+ *pdata = VMCS12_REVISION |
+ ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+ (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+ break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
+ nested_vmx_pinbased_ctls_high);
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
+ nested_vmx_procbased_ctls_high);
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
+ nested_vmx_exit_ctls_high);
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
+ nested_vmx_entry_ctls_high);
+ break;
+ case MSR_IA32_VMX_MISC:
+ *pdata = 0;
+ break;
+ /*
+ * These MSRs specify bits which the guest must keep fixed (on or off)
+ * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+ * We picked the standard core2 setting.
+ */
+#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
+ case MSR_IA32_VMX_CR0_FIXED0:
+ *pdata = VMXON_CR0_ALWAYSON;
+ break;
+ case MSR_IA32_VMX_CR0_FIXED1:
+ *pdata = -1ULL;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ *pdata = VMXON_CR4_ALWAYSON;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED1:
+ *pdata = -1ULL;
+ break;
+ case MSR_IA32_VMX_VMCS_ENUM:
+ *pdata = 0x1f;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
+ nested_vmx_secondary_ctls_high);
+ break;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ /* Currently, no nested ept or nested vpid */
+ *pdata = 0;
+ break;
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+ if (!nested_vmx_allowed(vcpu))
+ return 0;
+
+ if (msr_index == MSR_IA32_FEATURE_CONTROL)
+ /* TODO: the right thing. */
+ return 1;
+ /*
+ * No need to treat VMX capability MSRs specially: If we don't handle
+ * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
+ */
+ return 0;
}
/*
@@ -1212,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
/* Otherwise falls through */
default:
vmx_load_host_state(to_vmx(vcpu));
+ if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
+ return 0;
msr = find_msr_entry(to_vmx(vcpu), msr_index);
if (msr) {
vmx_load_host_state(to_vmx(vcpu));
@@ -1243,9 +2099,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
break;
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
+ vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_FS_BASE, data);
break;
case MSR_GS_BASE:
+ vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_GS_BASE, data);
break;
case MSR_KERNEL_GS_BASE:
@@ -1281,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
return 1;
/* Otherwise falls through */
default:
+ if (vmx_set_vmx_msr(vcpu, msr_index, data))
+ break;
msr = find_msr_entry(vmx, msr_index);
if (msr) {
vmx_load_host_state(vmx);
@@ -1370,7 +2230,7 @@ static int hardware_enable(void *garbage)
if (read_cr4() & X86_CR4_VMXE)
return -EBUSY;
- INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
@@ -1394,14 +2254,14 @@ static int hardware_enable(void *garbage)
return 0;
}
-static void vmclear_local_vcpus(void)
+static void vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
- struct vcpu_vmx *vmx, *n;
+ struct loaded_vmcs *v, *n;
- list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
- local_vcpus_link)
- __vcpu_clear(vmx);
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
}
@@ -1416,7 +2276,7 @@ static void kvm_cpu_vmxoff(void)
static void hardware_disable(void *garbage)
{
if (vmm_exclusive) {
- vmclear_local_vcpus();
+ vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1597,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs)
free_pages((unsigned long)vmcs, vmcs_config.order);
}
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ if (!loaded_vmcs->vmcs)
+ return;
+ loaded_vmcs_clear(loaded_vmcs);
+ free_vmcs(loaded_vmcs->vmcs);
+ loaded_vmcs->vmcs = NULL;
+}
+
static void free_kvm_area(void)
{
int cpu;
@@ -1657,6 +2529,9 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ple())
ple_gap = 0;
+ if (nested)
+ nested_vmx_setup_ctls_msrs();
+
return alloc_kvm_area();
}
@@ -1689,6 +2564,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->emulation_required = 1;
vmx->rmode.vm86_active = 0;
+ vmx_segment_cache_clear(vmx);
+
vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
@@ -1712,6 +2589,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
+ vmx_segment_cache_clear(vmx);
+
vmcs_write16(GUEST_SS_SELECTOR, 0);
vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1775,6 +2654,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
}
+ vmx_segment_cache_clear(vmx);
+
vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1851,6 +2732,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
{
u32 guest_tr_ar;
+ vmx_segment_cache_clear(to_vmx(vcpu));
+
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
@@ -1934,13 +2817,14 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_dirty);
}
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
- vmx_decache_cr3(vcpu);
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1998,6 +2882,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
}
static u64 construct_eptp(unsigned long root_hpa)
@@ -2030,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vmcs_writel(GUEST_CR3, guest_cr3);
}
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+ if (cr4 & X86_CR4_VMXE) {
+ /*
+ * To use VMXON (and later other VMX instructions), a guest
+ * must first be able to turn on cr4.VMXE (see handle_vmon()).
+ * So basically the check on whether to allow nested VMX
+ * is here.
+ */
+ if (!nested_vmx_allowed(vcpu))
+ return 1;
+ } else if (to_vmx(vcpu)->nested.vmxon)
+ return 1;
+
vcpu->arch.cr4 = cr4;
if (enable_ept) {
if (!is_paging(vcpu)) {
@@ -2047,13 +2944,13 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
+ return 0;
}
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
struct kvm_save_segment *save;
u32 ar;
@@ -2075,13 +2972,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->limit = save->limit;
ar = save->ar;
if (seg == VCPU_SREG_TR
- || var->selector == vmcs_read16(sf->selector))
+ || var->selector == vmx_read_guest_seg_selector(vmx, seg))
goto use_saved_rmode_seg;
}
- var->base = vmcs_readl(sf->base);
- var->limit = vmcs_read32(sf->limit);
- var->selector = vmcs_read16(sf->selector);
- ar = vmcs_read32(sf->ar_bytes);
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->limit = vmx_read_guest_seg_limit(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ ar = vmx_read_guest_seg_ar(vmx, seg);
use_saved_rmode_seg:
if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
ar = 0;
@@ -2098,27 +2995,37 @@ use_saved_rmode_seg:
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
struct kvm_segment s;
if (to_vmx(vcpu)->rmode.vm86_active) {
vmx_get_segment(vcpu, &s, seg);
return s.base;
}
- return vmcs_readl(sf->base);
+ return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}
-static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
{
if (!is_protmode(vcpu))
return 0;
- if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
+ if (!is_long_mode(vcpu)
+ && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
return 3;
- return vmcs_read16(GUEST_CS_SELECTOR) & 3;
+ return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
+}
+
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
+ __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+ to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
+ }
+ return to_vmx(vcpu)->cpl;
}
+
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
@@ -2148,6 +3055,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
+ vmx_segment_cache_clear(vmx);
+
if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
vmcs_write16(sf->selector, var->selector);
vmx->rmode.tr.selector = var->selector;
@@ -2184,11 +3093,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
ar |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, ar);
+ __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
- u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
+ u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
*db = (ar >> 14) & 1;
*l = (ar >> 13) & 1;
@@ -2600,18 +3510,110 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
}
/*
+ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
+ * will not change in the lifetime of the guest.
+ * Note that host-state that does change is set elsewhere. E.g., host-state
+ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
+ */
+static void vmx_set_constant_host_state(void)
+{
+ u32 low32, high32;
+ unsigned long tmpl;
+ struct desc_ptr dt;
+
+ vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
+ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
+ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
+
+ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
+
+ native_store_idt(&dt);
+ vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
+
+ asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
+ vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
+
+ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
+ vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+ rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+ vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
+
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, low32, high32);
+ vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
+ }
+}
+
+static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
+{
+ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+ if (enable_ept)
+ vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ if (is_guest_mode(&vmx->vcpu))
+ vmx->vcpu.arch.cr4_guest_owned_bits &=
+ ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+}
+
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+{
+ u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+ if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_STORE_EXITING |
+ CPU_BASED_CR8_LOAD_EXITING;
+#endif
+ }
+ if (!enable_ept)
+ exec_control |= CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_INVLPG_EXITING;
+ return exec_control;
+}
+
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+ u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+ if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+ if (!enable_ept) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ enable_unrestricted_guest = 0;
+ }
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (!ple_gap)
+ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ return exec_control;
+}
+
+static void ept_set_mmio_spte_mask(void)
+{
+ /*
+ * EPT Misconfigurations can be generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ * Also, magic bits (0xffull << 49) is set to quickly identify mmio
+ * spte.
+ */
+ kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+}
+
+/*
* Sets up the vmcs for emulated real mode.
*/
static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
- u32 host_sysenter_cs, msr_low, msr_high;
- u32 junk;
- u64 host_pat;
+#ifdef CONFIG_X86_64
unsigned long a;
- struct desc_ptr dt;
+#endif
int i;
- unsigned long kvm_vmx_return;
- u32 exec_control;
/* I/O */
vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -2626,36 +3628,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
vmcs_config.pin_based_exec_ctrl);
- exec_control = vmcs_config.cpu_based_exec_ctrl;
- if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
- exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
- exec_control |= CPU_BASED_CR8_STORE_EXITING |
- CPU_BASED_CR8_LOAD_EXITING;
-#endif
- }
- if (!enable_ept)
- exec_control |= CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_INVLPG_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
- exec_control &=
- ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- if (vmx->vpid == 0)
- exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
- if (!enable_ept) {
- exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
- enable_unrestricted_guest = 0;
- }
- if (!enable_unrestricted_guest)
- exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
- if (!ple_gap)
- exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ vmx_secondary_exec_control(vmx));
}
if (ple_gap) {
@@ -2663,20 +3640,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(PLE_WINDOW, ple_window);
}
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
- vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
-
- vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
- vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
- vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
- vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmx_set_constant_host_state();
#ifdef CONFIG_X86_64
rdmsrl(MSR_FS_BASE, a);
vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -2687,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
#endif
- vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
-
- native_store_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
-
- asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
- vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
- rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
- vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
- rdmsrl(MSR_IA32_SYSENTER_ESP, a);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
- rdmsrl(MSR_IA32_SYSENTER_EIP, a);
- vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
-
- if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
- rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
- host_pat = msr_low | ((u64) msr_high << 32);
- vmcs_write64(HOST_IA32_PAT, host_pat);
- }
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ u32 msr_low, msr_high;
+ u64 host_pat;
rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
host_pat = msr_low | ((u64) msr_high << 32);
/* Write the default value follow host pat */
@@ -2742,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
- vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
- if (enable_ept)
- vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
- vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+ set_cr4_guest_host_mask(vmx);
kvm_write_tsc(&vmx->vcpu, 0);
@@ -2775,6 +3725,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
if (ret != 0)
goto out;
+ vmx_segment_cache_clear(vmx);
+
seg_setup(VCPU_SREG_CS);
/*
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
@@ -2867,9 +3819,25 @@ out:
return ret;
}
+/*
+ * In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK;
+}
+
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ /* We can get here when nested_run_pending caused
+ * vmx_interrupt_allowed() to return false. In this case, do
+ * nothing - the interrupt will be injected later.
+ */
+ return;
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -2904,7 +3872,10 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
+ int inc_eip = 0;
+ if (vcpu->arch.interrupt.soft)
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -2923,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (is_guest_mode(vcpu))
+ return;
+
if (!cpu_has_virtual_nmis()) {
/*
* Tracking the NMI-blocked state in software is built upon
@@ -2937,8 +3911,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
++vcpu->stat.nmi_injections;
+ vmx->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
+ if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
@@ -2961,6 +3936,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
if (!cpu_has_virtual_nmis())
return to_vmx(vcpu)->soft_vnmi_blocked;
+ if (to_vmx(vcpu)->nmi_known_unmasked)
+ return false;
return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
}
@@ -2974,6 +3951,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
vmx->vnmi_blocked_time = 0;
}
} else {
+ vmx->nmi_known_unmasked = !masked;
if (masked)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
@@ -2985,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+ struct vmcs12 *vmcs12;
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+ nested_vmx_vmexit(vcpu);
+ vmcs12 = get_vmcs12(vcpu);
+ vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
+ vmcs12->vm_exit_intr_info = 0;
+ /* fall through to normal code, but now in L1, not L2 */
+ }
+
return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3091,7 +4080,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
enum emulation_result er;
vect_info = vmx->idt_vectoring_info;
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ intr_info = vmx->exit_intr_info;
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
@@ -3122,7 +4111,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
error_code = 0;
- rip = kvm_rip_read(vcpu);
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
@@ -3169,6 +4157,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ rip = kvm_rip_read(vcpu);
kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
kvm_run->debug.arch.exception = ex_no;
break;
@@ -3226,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
+/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (to_vmx(vcpu)->nested.vmxon &&
+ ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+ return 1;
+
+ if (is_guest_mode(vcpu)) {
+ /*
+ * We get here when L2 changed cr0 in a way that did not change
+ * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
+ * but did change L0 shadowed bits. This can currently happen
+ * with the TS bit: L0 may want to leave TS on (for lazy fpu
+ * loading) while pretending to allow the guest to change it.
+ */
+ if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
+ (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
+ return 1;
+ vmcs_writel(CR0_READ_SHADOW, val);
+ return 0;
+ } else
+ return kvm_set_cr0(vcpu, val);
+}
+
+static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_guest_mode(vcpu)) {
+ if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
+ (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
+ return 1;
+ vmcs_writel(CR4_READ_SHADOW, val);
+ return 0;
+ } else
+ return kvm_set_cr4(vcpu, val);
+}
+
+/* called to set cr0 as approriate for clts instruction exit. */
+static void handle_clts(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu)) {
+ /*
+ * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
+ * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
+ * just pretend it's off (also in arch.cr0 for fpu_activate).
+ */
+ vmcs_writel(CR0_READ_SHADOW,
+ vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+ vcpu->arch.cr0 &= ~X86_CR0_TS;
+ } else
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+}
+
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
@@ -3242,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- err = kvm_set_cr0(vcpu, val);
+ err = handle_set_cr0(vcpu, val);
kvm_complete_insn_gp(vcpu, err);
return 1;
case 3:
@@ -3250,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
kvm_complete_insn_gp(vcpu, err);
return 1;
case 4:
- err = kvm_set_cr4(vcpu, val);
+ err = handle_set_cr4(vcpu, val);
kvm_complete_insn_gp(vcpu, err);
return 1;
case 8: {
@@ -3268,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
};
break;
case 2: /* clts */
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ handle_clts(vcpu);
trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
skip_emulated_instruction(vcpu);
vmx_fpu_activate(vcpu);
@@ -3444,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
-{
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
-}
-
static int handle_invd(struct kvm_vcpu *vcpu)
{
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -3505,9 +4540,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
switch (type) {
case INTR_TYPE_NMI_INTR:
vcpu->arch.nmi_injected = false;
- if (cpu_has_virtual_nmis())
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ vmx_set_nmi_mask(vcpu, true);
break;
case INTR_TYPE_EXT_INTR:
case INTR_TYPE_SOFT_INTR:
@@ -3649,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
u64 sptes[4];
- int nr_sptes, i;
+ int nr_sptes, i, ret;
gpa_t gpa;
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+ if (likely(ret == 1))
+ return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
+ EMULATE_DONE;
+ if (unlikely(!ret))
+ return 1;
+
+ /* It is the real ept misconfig */
printk(KERN_ERR "EPT: Misconfiguration.\n");
printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
@@ -3738,6 +4779,639 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
}
/*
+ * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
+ * We could reuse a single VMCS for all the L2 guests, but we also want the
+ * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
+ * allows keeping them loaded on the processor, and in the future will allow
+ * optimizations where prepare_vmcs02 doesn't need to set all the fields on
+ * every entry if they never change.
+ * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
+ * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
+ *
+ * The following functions allocate and free a vmcs02 in this pool.
+ */
+
+/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
+static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
+{
+ struct vmcs02_list *item;
+ list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+ if (item->vmptr == vmx->nested.current_vmptr) {
+ list_move(&item->list, &vmx->nested.vmcs02_pool);
+ return &item->vmcs02;
+ }
+
+ if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
+ /* Recycle the least recently used VMCS. */
+ item = list_entry(vmx->nested.vmcs02_pool.prev,
+ struct vmcs02_list, list);
+ item->vmptr = vmx->nested.current_vmptr;
+ list_move(&item->list, &vmx->nested.vmcs02_pool);
+ return &item->vmcs02;
+ }
+
+ /* Create a new VMCS */
+ item = (struct vmcs02_list *)
+ kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+ if (!item)
+ return NULL;
+ item->vmcs02.vmcs = alloc_vmcs();
+ if (!item->vmcs02.vmcs) {
+ kfree(item);
+ return NULL;
+ }
+ loaded_vmcs_init(&item->vmcs02);
+ item->vmptr = vmx->nested.current_vmptr;
+ list_add(&(item->list), &(vmx->nested.vmcs02_pool));
+ vmx->nested.vmcs02_num++;
+ return &item->vmcs02;
+}
+
+/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
+static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+ struct vmcs02_list *item;
+ list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+ if (item->vmptr == vmptr) {
+ free_loaded_vmcs(&item->vmcs02);
+ list_del(&item->list);
+ kfree(item);
+ vmx->nested.vmcs02_num--;
+ return;
+ }
+}
+
+/*
+ * Free all VMCSs saved for this vcpu, except the one pointed by
+ * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
+ * currently used, if running L2), and vmcs01 when running L2.
+ */
+static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
+{
+ struct vmcs02_list *item, *n;
+ list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
+ if (vmx->loaded_vmcs != &item->vmcs02)
+ free_loaded_vmcs(&item->vmcs02);
+ list_del(&item->list);
+ kfree(item);
+ }
+ vmx->nested.vmcs02_num = 0;
+
+ if (vmx->loaded_vmcs != &vmx->vmcs01)
+ free_loaded_vmcs(&vmx->vmcs01);
+}
+
+/*
+ * Emulate the VMXON instruction.
+ * Currently, we just remember that VMX is active, and do not save or even
+ * inspect the argument to VMXON (the so-called "VMXON pointer") because we
+ * do not currently need to store anything in that guest-allocated memory
+ * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
+ * argument is different from the VMXON pointer (which the spec says they do).
+ */
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* The Intel VMX Instruction Reference lists a bunch of bits that
+ * are prerequisite to running VMXON, most notably cr4.VMXE must be
+ * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
+ * Otherwise, we should fail with #UD. We test these now:
+ */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
+ !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
+ (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ if (is_long_mode(vcpu) && !cs.l) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+ vmx->nested.vmcs02_num = 0;
+
+ vmx->nested.vmxon = true;
+
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->nested.vmxon) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+ (is_long_mode(vcpu) && !cs.l)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct vcpu_vmx *vmx)
+{
+ if (!vmx->nested.vmxon)
+ return;
+ vmx->nested.vmxon = false;
+ if (vmx->nested.current_vmptr != -1ull) {
+ kunmap(vmx->nested.current_vmcs12_page);
+ nested_release_page(vmx->nested.current_vmcs12_page);
+ vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmcs12 = NULL;
+ }
+ /* Unpin physical memory we referred to in current vmcs02 */
+ if (vmx->nested.apic_access_page) {
+ nested_release_page(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = 0;
+ }
+
+ nested_free_all_saved_vmcss(vmx);
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+ free_nested(to_vmx(vcpu));
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD or #GP.
+ */
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
+ unsigned long exit_qualification,
+ u32 vmx_instruction_info, gva_t *ret)
+{
+ /*
+ * According to Vol. 3B, "Information for VM Exits Due to Instruction
+ * Execution", on an exit, vmx_instruction_info holds most of the
+ * addressing components of the operand. Only the displacement part
+ * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+ * For how an actual address is calculated from all these components,
+ * refer to Vol. 1, "Operand Addressing".
+ */
+ int scaling = vmx_instruction_info & 3;
+ int addr_size = (vmx_instruction_info >> 7) & 7;
+ bool is_reg = vmx_instruction_info & (1u << 10);
+ int seg_reg = (vmx_instruction_info >> 15) & 7;
+ int index_reg = (vmx_instruction_info >> 18) & 0xf;
+ bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+ int base_reg = (vmx_instruction_info >> 23) & 0xf;
+ bool base_is_valid = !(vmx_instruction_info & (1u << 27));
+
+ if (is_reg) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* Addr = segment_base + offset */
+ /* offset = base + [index * scale] + displacement */
+ *ret = vmx_get_segment_base(vcpu, seg_reg);
+ if (base_is_valid)
+ *ret += kvm_register_read(vcpu, base_reg);
+ if (index_is_valid)
+ *ret += kvm_register_read(vcpu, index_reg)<<scaling;
+ *ret += exit_qualification; /* holds the displacement */
+
+ if (addr_size == 1) /* 32 bit */
+ *ret &= 0xffffffff;
+
+ /*
+ * TODO: throw #GP (and return 1) in various cases that the VM*
+ * instructions require it - e.g., offset beyond segment limit,
+ * unusable or unreadable/unwritable segment, non-canonical 64-bit
+ * address, and so on. Currently these are not checked.
+ */
+ return 0;
+}
+
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_CF);
+}
+
+static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
+ u32 vm_instruction_error)
+{
+ if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done there isn't a current VMCS.
+ */
+ nested_vmx_failInvalid(vcpu);
+ return;
+ }
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_ZF);
+ get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+}
+
+/* Emulate the VMCLEAR instruction */
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gva_t gva;
+ gpa_t vmptr;
+ struct vmcs12 *vmcs12;
+ struct page *page;
+ struct x86_exception e;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+ return 1;
+
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+ sizeof(vmptr), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ if (vmptr == vmx->nested.current_vmptr) {
+ kunmap(vmx->nested.current_vmcs12_page);
+ nested_release_page(vmx->nested.current_vmcs12_page);
+ vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmcs12 = NULL;
+ }
+
+ page = nested_get_page(vcpu, vmptr);
+ if (page == NULL) {
+ /*
+ * For accurate processor emulation, VMCLEAR beyond available
+ * physical memory should do nothing at all. However, it is
+ * possible that a nested vmx bug, not a guest hypervisor bug,
+ * resulted in this case, so let's shut down before doing any
+ * more damage:
+ */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return 1;
+ }
+ vmcs12 = kmap(page);
+ vmcs12->launch_state = 0;
+ kunmap(page);
+ nested_release_page(page);
+
+ nested_free_vmcs02(vmx, vmptr);
+
+ skip_emulated_instruction(vcpu);
+ nested_vmx_succeed(vcpu);
+ return 1;
+}
+
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+ return nested_vmx_run(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+
+ return nested_vmx_run(vcpu, false);
+}
+
+enum vmcs_field_type {
+ VMCS_FIELD_TYPE_U16 = 0,
+ VMCS_FIELD_TYPE_U64 = 1,
+ VMCS_FIELD_TYPE_U32 = 2,
+ VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
+};
+
+static inline int vmcs_field_type(unsigned long field)
+{
+ if (0x1 & field) /* the *_HIGH fields are all 32 bit */
+ return VMCS_FIELD_TYPE_U32;
+ return (field >> 13) & 0x3 ;
+}
+
+static inline int vmcs_field_readonly(unsigned long field)
+{
+ return (((field >> 10) & 0x3) == 1);
+}
+
+/*
+ * Read a vmcs12 field. Since these can have varying lengths and we return
+ * one type, we chose the biggest type (u64) and zero-extend the return value
+ * to that size. Note that the caller, handle_vmread, might need to use only
+ * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
+ * 64-bit fields are to be returned).
+ */
+static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
+ unsigned long field, u64 *ret)
+{
+ short offset = vmcs_field_to_offset(field);
+ char *p;
+
+ if (offset < 0)
+ return 0;
+
+ p = ((char *)(get_vmcs12(vcpu))) + offset;
+
+ switch (vmcs_field_type(field)) {
+ case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+ *ret = *((natural_width *)p);
+ return 1;
+ case VMCS_FIELD_TYPE_U16:
+ *ret = *((u16 *)p);
+ return 1;
+ case VMCS_FIELD_TYPE_U32:
+ *ret = *((u32 *)p);
+ return 1;
+ case VMCS_FIELD_TYPE_U64:
+ *ret = *((u64 *)p);
+ return 1;
+ default:
+ return 0; /* can never happen. */
+ }
+}
+
+/*
+ * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
+ * used before) all generate the same failure when it is missing.
+ */
+static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vmx->nested.current_vmptr == -1ull) {
+ nested_vmx_failInvalid(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 0;
+ }
+ return 1;
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+ unsigned long field;
+ u64 field_value;
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gva_t gva = 0;
+
+ if (!nested_vmx_check_permission(vcpu) ||
+ !nested_vmx_check_vmcs12(vcpu))
+ return 1;
+
+ /* Decode instruction info and find the field to read */
+ field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ /* Read the field, zero-extended to a u64 field_value */
+ if (!vmcs12_read_any(vcpu, field, &field_value)) {
+ nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ /*
+ * Now copy part of this value to register or memory, as requested.
+ * Note that the number of bits actually copied is 32 or 64 depending
+ * on the guest's mode (32 or 64 bit), not on the given field's length.
+ */
+ if (vmx_instruction_info & (1u << 10)) {
+ kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+ field_value);
+ } else {
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ vmx_instruction_info, &gva))
+ return 1;
+ /* _system ok, as nested_vmx_check_permission verified cpl=0 */
+ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
+ &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
+ }
+
+ nested_vmx_succeed(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+ unsigned long field;
+ gva_t gva;
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ char *p;
+ short offset;
+ /* The value to write might be 32 or 64 bits, depending on L1's long
+ * mode, and eventually we need to write that into a field of several
+ * possible lengths. The code below first zero-extends the value to 64
+ * bit (field_value), and then copies only the approriate number of
+ * bits into the vmcs12 field.
+ */
+ u64 field_value = 0;
+ struct x86_exception e;
+
+ if (!nested_vmx_check_permission(vcpu) ||
+ !nested_vmx_check_vmcs12(vcpu))
+ return 1;
+
+ if (vmx_instruction_info & (1u << 10))
+ field_value = kvm_register_read(vcpu,
+ (((vmx_instruction_info) >> 3) & 0xf));
+ else {
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ vmx_instruction_info, &gva))
+ return 1;
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
+ &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ }
+
+
+ field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ if (vmcs_field_readonly(field)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ offset = vmcs_field_to_offset(field);
+ if (offset < 0) {
+ nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ p = ((char *) get_vmcs12(vcpu)) + offset;
+
+ switch (vmcs_field_type(field)) {
+ case VMCS_FIELD_TYPE_U16:
+ *(u16 *)p = field_value;
+ break;
+ case VMCS_FIELD_TYPE_U32:
+ *(u32 *)p = field_value;
+ break;
+ case VMCS_FIELD_TYPE_U64:
+ *(u64 *)p = field_value;
+ break;
+ case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+ *(natural_width *)p = field_value;
+ break;
+ default:
+ nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ nested_vmx_succeed(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+/* Emulate the VMPTRLD instruction */
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gva_t gva;
+ gpa_t vmptr;
+ struct x86_exception e;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+ return 1;
+
+ if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+ sizeof(vmptr), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ if (vmx->nested.current_vmptr != vmptr) {
+ struct vmcs12 *new_vmcs12;
+ struct page *page;
+ page = nested_get_page(vcpu, vmptr);
+ if (page == NULL) {
+ nested_vmx_failInvalid(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ new_vmcs12 = kmap(page);
+ if (new_vmcs12->revision_id != VMCS12_REVISION) {
+ kunmap(page);
+ nested_release_page_clean(page);
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ if (vmx->nested.current_vmptr != -1ull) {
+ kunmap(vmx->nested.current_vmcs12_page);
+ nested_release_page(vmx->nested.current_vmcs12_page);
+ }
+
+ vmx->nested.current_vmptr = vmptr;
+ vmx->nested.current_vmcs12 = new_vmcs12;
+ vmx->nested.current_vmcs12_page = page;
+ }
+
+ nested_vmx_succeed(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+/* Emulate the VMPTRST instruction */
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gva_t vmcs_gva;
+ struct x86_exception e;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ vmx_instruction_info, &vmcs_gva))
+ return 1;
+ /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
+ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
+ (void *)&to_vmx(vcpu)->nested.current_vmptr,
+ sizeof(u64), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ nested_vmx_succeed(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
* to be done to userspace and return 0.
@@ -3758,15 +5432,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
- [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
- [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
- [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
- [EXIT_REASON_VMPTRST] = handle_vmx_insn,
- [EXIT_REASON_VMREAD] = handle_vmx_insn,
- [EXIT_REASON_VMRESUME] = handle_vmx_insn,
- [EXIT_REASON_VMWRITE] = handle_vmx_insn,
- [EXIT_REASON_VMOFF] = handle_vmx_insn,
- [EXIT_REASON_VMON] = handle_vmx_insn,
+ [EXIT_REASON_VMCLEAR] = handle_vmclear,
+ [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
+ [EXIT_REASON_VMPTRLD] = handle_vmptrld,
+ [EXIT_REASON_VMPTRST] = handle_vmptrst,
+ [EXIT_REASON_VMREAD] = handle_vmread,
+ [EXIT_REASON_VMRESUME] = handle_vmresume,
+ [EXIT_REASON_VMWRITE] = handle_vmwrite,
+ [EXIT_REASON_VMOFF] = handle_vmoff,
+ [EXIT_REASON_VMON] = handle_vmon,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
@@ -3783,6 +5457,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
+ * rather than handle it ourselves in L0. I.e., check whether L1 expressed
+ * disinterest in the current event (read or write a specific MSR) by using an
+ * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12, u32 exit_reason)
+{
+ u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+ gpa_t bitmap;
+
+ if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
+ return 1;
+
+ /*
+ * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
+ * for the four combinations of read/write and low/high MSR numbers.
+ * First we need to figure out which of the four to use:
+ */
+ bitmap = vmcs12->msr_bitmap;
+ if (exit_reason == EXIT_REASON_MSR_WRITE)
+ bitmap += 2048;
+ if (msr_index >= 0xc0000000) {
+ msr_index -= 0xc0000000;
+ bitmap += 1024;
+ }
+
+ /* Then read the msr_index'th bit from this bitmap: */
+ if (msr_index < 1024*8) {
+ unsigned char b;
+ kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
+ return 1 & (b >> (msr_index & 7));
+ } else
+ return 1; /* let L1 handle the wrong parameter */
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int cr = exit_qualification & 15;
+ int reg = (exit_qualification >> 8) & 15;
+ unsigned long val = kvm_register_read(vcpu, reg);
+
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ switch (cr) {
+ case 0:
+ if (vmcs12->cr0_guest_host_mask &
+ (val ^ vmcs12->cr0_read_shadow))
+ return 1;
+ break;
+ case 3:
+ if ((vmcs12->cr3_target_count >= 1 &&
+ vmcs12->cr3_target_value0 == val) ||
+ (vmcs12->cr3_target_count >= 2 &&
+ vmcs12->cr3_target_value1 == val) ||
+ (vmcs12->cr3_target_count >= 3 &&
+ vmcs12->cr3_target_value2 == val) ||
+ (vmcs12->cr3_target_count >= 4 &&
+ vmcs12->cr3_target_value3 == val))
+ return 0;
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+ return 1;
+ break;
+ case 4:
+ if (vmcs12->cr4_guest_host_mask &
+ (vmcs12->cr4_read_shadow ^ val))
+ return 1;
+ break;
+ case 8:
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+ return 1;
+ break;
+ }
+ break;
+ case 2: /* clts */
+ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
+ (vmcs12->cr0_read_shadow & X86_CR0_TS))
+ return 1;
+ break;
+ case 1: /* mov from cr */
+ switch (cr) {
+ case 3:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_STORE_EXITING)
+ return 1;
+ break;
+ case 8:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_STORE_EXITING)
+ return 1;
+ break;
+ }
+ break;
+ case 3: /* lmsw */
+ /*
+ * lmsw can change bits 1..3 of cr0, and only set bit 0 of
+ * cr0. Other attempted changes are ignored, with no exit.
+ */
+ if (vmcs12->cr0_guest_host_mask & 0xe &
+ (val ^ vmcs12->cr0_read_shadow))
+ return 1;
+ if ((vmcs12->cr0_guest_host_mask & 0x1) &&
+ !(vmcs12->cr0_read_shadow & 0x1) &&
+ (val & 0x1))
+ return 1;
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
+ * should handle it ourselves in L0 (and then continue L2). Only call this
+ * when in is_guest_mode (L2).
+ */
+static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+{
+ u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
+ u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (vmx->nested.nested_run_pending)
+ return 0;
+
+ if (unlikely(vmx->fail)) {
+ printk(KERN_INFO "%s failed vm entry %x\n",
+ __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+ return 1;
+ }
+
+ switch (exit_reason) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ if (!is_exception(intr_info))
+ return 0;
+ else if (is_page_fault(intr_info))
+ return enable_ept;
+ return vmcs12->exception_bitmap &
+ (1u << (intr_info & INTR_INFO_VECTOR_MASK));
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return 0;
+ case EXIT_REASON_TRIPLE_FAULT:
+ return 1;
+ case EXIT_REASON_PENDING_INTERRUPT:
+ case EXIT_REASON_NMI_WINDOW:
+ /*
+ * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
+ * (aka Interrupt Window Exiting) only when L1 turned it on,
+ * so if we got a PENDING_INTERRUPT exit, this must be for L1.
+ * Same for NMI Window Exiting.
+ */
+ return 1;
+ case EXIT_REASON_TASK_SWITCH:
+ return 1;
+ case EXIT_REASON_CPUID:
+ return 1;
+ case EXIT_REASON_HLT:
+ return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+ case EXIT_REASON_INVD:
+ return 1;
+ case EXIT_REASON_INVLPG:
+ return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+ case EXIT_REASON_RDPMC:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+ case EXIT_REASON_RDTSC:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+ case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
+ case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
+ case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
+ case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
+ case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+ /*
+ * VMX instructions trap unconditionally. This allows L1 to
+ * emulate them for its L2 guest, i.e., allows 3-level nesting!
+ */
+ return 1;
+ case EXIT_REASON_CR_ACCESS:
+ return nested_vmx_exit_handled_cr(vcpu, vmcs12);
+ case EXIT_REASON_DR_ACCESS:
+ return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+ case EXIT_REASON_IO_INSTRUCTION:
+ /* TODO: support IO bitmaps */
+ return 1;
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
+ case EXIT_REASON_INVALID_STATE:
+ return 1;
+ case EXIT_REASON_MWAIT_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
+ case EXIT_REASON_MONITOR_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
+ case EXIT_REASON_PAUSE_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
+ nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING);
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ return 0;
+ case EXIT_REASON_TPR_BELOW_THRESHOLD:
+ return 1;
+ case EXIT_REASON_APIC_ACCESS:
+ return nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ case EXIT_REASON_EPT_VIOLATION:
+ case EXIT_REASON_EPT_MISCONFIG:
+ return 0;
+ case EXIT_REASON_WBINVD:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
+ case EXIT_REASON_XSETBV:
+ return 1;
+ default:
+ return 1;
+ }
+}
+
static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
*info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -3805,6 +5702,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (vmx->emulation_required && emulate_invalid_guest_state)
return handle_invalid_guest_state(vcpu);
+ /*
+ * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
+ * we did not inject a still-pending event to L1 now because of
+ * nested_run_pending, we need to re-enable this bit.
+ */
+ if (vmx->nested.nested_run_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
+ exit_reason == EXIT_REASON_VMRESUME))
+ vmx->nested.nested_run_pending = 1;
+ else
+ vmx->nested.nested_run_pending = 0;
+
+ if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
+ nested_vmx_vmexit(vcpu);
+ return 1;
+ }
+
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3827,7 +5743,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
"(0x%x) and exit reason is 0x%x\n",
__func__, vectoring_info, exit_reason);
- if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
+ !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
+ get_vmcs12(vcpu), vcpu)))) {
if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -3867,12 +5785,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
- u32 exit_intr_info = vmx->exit_intr_info;
+ u32 exit_intr_info;
+
+ if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
+ || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
+ return;
+
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ exit_intr_info = vmx->exit_intr_info;
/* Handle machine checks before interrupts are enabled */
- if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
- || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
- && is_machine_check(exit_intr_info)))
+ if (is_machine_check(exit_intr_info))
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
@@ -3886,7 +5809,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
- u32 exit_intr_info = vmx->exit_intr_info;
+ u32 exit_intr_info;
bool unblock_nmi;
u8 vector;
bool idtv_info_valid;
@@ -3894,6 +5817,13 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
if (cpu_has_virtual_nmis()) {
+ if (vmx->nmi_known_unmasked)
+ return;
+ /*
+ * Can't use vmx->exit_intr_info since we're not sure what
+ * the exit reason is.
+ */
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
/*
@@ -3910,6 +5840,10 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
vector != DF_VECTOR && !idtv_info_valid)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
+ else
+ vmx->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
} else if (unlikely(vmx->soft_vnmi_blocked))
vmx->vnmi_blocked_time +=
ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
@@ -3946,8 +5880,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
* Clear bit "block by NMI" before VM entry if a NMI
* delivery faulted.
*/
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ vmx_set_nmi_mask(&vmx->vcpu, false);
break;
case INTR_TYPE_SOFT_EXCEPTION:
vmx->vcpu.arch.event_exit_inst_len =
@@ -3975,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
+ if (is_guest_mode(&vmx->vcpu))
+ return;
__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
VM_EXIT_INSTRUCTION_LEN,
IDT_VECTORING_ERROR_CODE);
@@ -3982,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu))
+ return;
__vmx_complete_interrupts(to_vmx(vcpu),
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
VM_ENTRY_INSTRUCTION_LEN,
@@ -4002,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ if (vmcs12->idt_vectoring_info_field &
+ VECTORING_INFO_VALID_MASK) {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ vmcs12->idt_vectoring_info_field);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs12->vm_exit_instruction_len);
+ if (vmcs12->idt_vectoring_info_field &
+ VECTORING_INFO_DELIVER_CODE_MASK)
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs12->idt_vectoring_error_code);
+ }
+ }
+
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
vmx->entry_time = ktime_get();
@@ -4024,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -4094,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
"pop %%"R"bp; pop %%"R"dx \n\t"
"setbe %c[fail](%0) \n\t"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
- [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+ [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4124,17 +6077,30 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
);
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_RFLAGS)
+ | (1 << VCPU_EXREG_CPL)
| (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_SEGMENTS)
| (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ if (is_guest_mode(vcpu)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
+ if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+ vmcs12->idt_vectoring_error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ vmcs12->vm_exit_instruction_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ }
+ }
+
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
- vmx->launched = 1;
+ vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
@@ -4144,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#undef R
#undef Q
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (vmx->vmcs) {
- vcpu_clear(vmx);
- free_vmcs(vmx->vmcs);
- vmx->vmcs = NULL;
- }
-}
-
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
free_vpid(vmx);
- vmx_free_vmcs(vcpu);
+ free_nested(vmx);
+ free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
}
-static inline void vmcs_init(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
-
- if (!vmm_exclusive)
- kvm_cpu_vmxon(phys_addr);
-
- vmcs_clear(vmcs);
-
- if (!vmm_exclusive)
- kvm_cpu_vmxoff();
-}
-
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
@@ -4195,16 +6138,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vcpu;
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ err = -ENOMEM;
if (!vmx->guest_msrs) {
- err = -ENOMEM;
goto uninit_vcpu;
}
- vmx->vmcs = alloc_vmcs();
- if (!vmx->vmcs)
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx->loaded_vmcs->vmcs = alloc_vmcs();
+ if (!vmx->loaded_vmcs->vmcs)
goto free_msrs;
-
- vmcs_init(vmx->vmcs);
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
+ loaded_vmcs_init(vmx->loaded_vmcs);
+ if (!vmm_exclusive)
+ kvm_cpu_vmxoff();
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4215,7 +6162,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (err)
goto free_vmcs;
if (vm_need_virtualize_apic_accesses(kvm))
- if (alloc_apic_access_page(kvm) != 0)
+ err = alloc_apic_access_page(kvm);
+ if (err)
goto free_vmcs;
if (enable_ept) {
@@ -4229,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
+ vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmcs12 = NULL;
+
return &vmx->vcpu;
free_vmcs:
- free_vmcs(vmx->vmcs);
+ free_vmcs(vmx->loaded_vmcs->vmcs);
free_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu:
@@ -4366,6 +6317,657 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
+ if (func == 1 && nested)
+ entry->ecx |= bit(X86_FEATURE_VMX);
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ */
+static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exec_control;
+
+ vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+ vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+ vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+ vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+ vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+ vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+ vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+ vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+ vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+ vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+ vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+ vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+ vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+ vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+ vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+ vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+ vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+ vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+ vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+ vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+ vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+ vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+ vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+ vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+ vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+ vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+ vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+ vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+ vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+ vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+ vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ vmcs12->vm_entry_intr_info_field);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs12->vm_entry_exception_error_code);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs12->vm_entry_instruction_len);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+ vmcs12->guest_interruptibility_info);
+ vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+ vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
+ vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs12->guest_pending_dbg_exceptions);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+ (vmcs_config.pin_based_exec_ctrl |
+ vmcs12->pin_based_vm_exec_control));
+
+ /*
+ * Whether page-faults are trapped is determined by a combination of
+ * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+ * If enable_ept, L0 doesn't care about page faults and we should
+ * set all of these to L1's desires. However, if !enable_ept, L0 does
+ * care about (at least some) page faults, and because it is not easy
+ * (if at all possible?) to merge L0 and L1's desires, we simply ask
+ * to exit on each and every L2 page fault. This is done by setting
+ * MASK=MATCH=0 and (see below) EB.PF=1.
+ * Note that below we don't need special code to set EB.PF beyond the
+ * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+ * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+ * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+ *
+ * A problem with this approach (when !enable_ept) is that L1 may be
+ * injected with more page faults than it asked for. This could have
+ * caused problems, but in practice existing hypervisors don't care.
+ * To fix this, we will need to emulate the PFEC checking (on the L1
+ * page tables), using walk_addr(), when injecting PFs to L1.
+ */
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+ enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+ enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ u32 exec_control = vmx_secondary_exec_control(vmx);
+ if (!vmx->rdtscp_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDTSCP;
+ /* Take the following fields only from vmcs12 */
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (nested_cpu_has(vmcs12,
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+ exec_control |= vmcs12->secondary_vm_exec_control;
+
+ if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
+ /*
+ * Translate L1 physical address to host physical
+ * address for vmcs02. Keep the page pinned, so this
+ * physical address remains valid. We keep a reference
+ * to it so we can release it later.
+ */
+ if (vmx->nested.apic_access_page) /* shouldn't happen */
+ nested_release_page(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page =
+ nested_get_page(vcpu, vmcs12->apic_access_addr);
+ /*
+ * If translation failed, no matter: This feature asks
+ * to exit when accessing the given address, and if it
+ * can never be accessed, this feature won't do
+ * anything anyway.
+ */
+ if (!vmx->nested.apic_access_page)
+ exec_control &=
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ else
+ vmcs_write64(APIC_ACCESS_ADDR,
+ page_to_phys(vmx->nested.apic_access_page));
+ }
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ }
+
+
+ /*
+ * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+ * Some constant fields are set here by vmx_set_constant_host_state().
+ * Other fields are different per CPU, and will be set later when
+ * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+ */
+ vmx_set_constant_host_state();
+
+ /*
+ * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+ * entry, but only if the current (host) sp changed from the value
+ * we wrote last (vmx->host_rsp). This cache is no longer relevant
+ * if we switch vmcs, and rather than hold a separate cache per vmcs,
+ * here we just force the write to happen on entry.
+ */
+ vmx->host_rsp = 0;
+
+ exec_control = vmx_exec_control(vmx); /* L0's desires */
+ exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+ exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+ exec_control |= vmcs12->cpu_based_vm_exec_control;
+ /*
+ * Merging of IO and MSR bitmaps not currently supported.
+ * Rather, exit every time.
+ */
+ exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+ exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+ exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+ /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+ * bitwise-or of what L1 wants to trap for L2, and what we want to
+ * trap. Note that CR0.TS also needs updating - we do this later.
+ */
+ update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+ /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
+ vmcs_write32(VM_EXIT_CONTROLS,
+ vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
+ vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+ (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
+ else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+
+
+ set_cr4_guest_host_mask(vmx);
+
+ vmcs_write64(TSC_OFFSET,
+ vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+
+ if (enable_vpid) {
+ /*
+ * Trivially support vpid by letting L2s share their parent
+ * L1's vpid. TODO: move to a more elaborate solution, giving
+ * each L2 its own vpid and exposing the vpid feature to L1.
+ */
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+ vmx_flush_tlb(vcpu);
+ }
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+ vcpu->arch.efer = vmcs12->guest_ia32_efer;
+ if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ /*
+ * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
+ * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
+ * The CR0_READ_SHADOW is what L2 should have expected to read given
+ * the specifications by L1; It's not enough to take
+ * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
+ * have more bits than L1 expected.
+ */
+ vmx_set_cr0(vcpu, vmcs12->guest_cr0);
+ vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+
+ vmx_set_cr4(vcpu, vmcs12->guest_cr4);
+ vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
+
+ /* shadow page tables on either EPT or shadow page tables */
+ kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+ kvm_mmu_reset_context(vcpu);
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+ struct loaded_vmcs *vmcs02;
+
+ if (!nested_vmx_check_permission(vcpu) ||
+ !nested_vmx_check_vmcs12(vcpu))
+ return 1;
+
+ skip_emulated_instruction(vcpu);
+ vmcs12 = get_vmcs12(vcpu);
+
+ /*
+ * The nested entry process starts with enforcing various prerequisites
+ * on vmcs12 as required by the Intel SDM, and act appropriately when
+ * they fail: As the SDM explains, some conditions should cause the
+ * instruction to fail, while others will cause the instruction to seem
+ * to succeed, but return an EXIT_REASON_INVALID_STATE.
+ * To speed up the normal (success) code path, we should avoid checking
+ * for misconfigurations which will anyway be caught by the processor
+ * when using the merged vmcs02.
+ */
+ if (vmcs12->launch_state == launch) {
+ nested_vmx_failValid(vcpu,
+ launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+ : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+ return 1;
+ }
+
+ if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
+ !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
+ /*TODO: Also verify bits beyond physical address width are 0*/
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
+ /*TODO: Also verify bits beyond physical address width are 0*/
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (vmcs12->vm_entry_msr_load_count > 0 ||
+ vmcs12->vm_exit_msr_load_count > 0 ||
+ vmcs12->vm_exit_msr_store_count > 0) {
+ if (printk_ratelimit())
+ printk(KERN_WARNING
+ "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+ nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
+ !vmx_control_verify(vmcs12->secondary_vm_exec_control,
+ nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
+ !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+ nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
+ !vmx_control_verify(vmcs12->vm_exit_controls,
+ nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
+ !vmx_control_verify(vmcs12->vm_entry_controls,
+ nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
+ {
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+ ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+ return 1;
+ }
+
+ if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+ ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ return 1;
+ }
+ if (vmcs12->vmcs_link_pointer != -1ull) {
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
+ return 1;
+ }
+
+ /*
+ * We're finally done with prerequisite checking, and can start with
+ * the nested entry.
+ */
+
+ vmcs02 = nested_get_current_vmcs02(vmx);
+ if (!vmcs02)
+ return -ENOMEM;
+
+ enter_guest_mode(vcpu);
+
+ vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
+
+ cpu = get_cpu();
+ vmx->loaded_vmcs = vmcs02;
+ vmx_vcpu_put(vcpu);
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+ put_cpu();
+
+ vmcs12->launch_state = 1;
+
+ prepare_vmcs02(vcpu, vmcs12);
+
+ /*
+ * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+ * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+ * returned as far as L1 is concerned. It will only return (and set
+ * the success flag) when L2 exits (see nested_vmx_vmexit()).
+ */
+ return 1;
+}
+
+/*
+ * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
+ * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
+ * This function returns the new value we should put in vmcs12.guest_cr0.
+ * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
+ * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
+ * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
+ * didn't trap the bit, because if L1 did, so would L0).
+ * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
+ * been modified by L2, and L1 knows it. So just leave the old value of
+ * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
+ * isn't relevant, because if L0 traps this bit it can set it to anything.
+ * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
+ * changed these bits, and therefore they need to be updated, but L0
+ * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
+ * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
+ vcpu->arch.cr0_guest_owned_bits));
+}
+
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
+ vcpu->arch.cr4_guest_owned_bits));
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ /* update guest state fields: */
+ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+ vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+ kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
+ vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+ vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+ vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+ vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+ vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+ vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+ vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+ vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+ vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+ vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+ vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+ vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+ vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+ vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+ vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+ vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+ vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+ vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+ vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+ vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+ vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+ vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+ vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+ vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+ vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+ vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+ vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+ vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+ vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+ vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+ vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+ vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+ vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+ vmcs12->guest_interruptibility_info =
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ vmcs12->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
+ /* TODO: These cannot have changed unless we have MSR bitmaps and
+ * the relevant bit asks not to trap the change */
+ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
+ vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+ vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+ vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+ vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+
+ /* update exit information fields: */
+
+ vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON);
+ vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ vmcs12->idt_vectoring_info_field =
+ vmcs_read32(IDT_VECTORING_INFO_FIELD);
+ vmcs12->idt_vectoring_error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+ /* clear vm-entry fields which are to be cleared on exit */
+ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+}
+
+/*
+ * A part of what we need to when the nested L2 guest exits and we want to
+ * run its L1 parent, is to reset L1's guest state to the host state specified
+ * in vmcs12.
+ * This function is to be called not only on normal nested exit, but also on
+ * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
+ * Failures During or After Loading Guest State").
+ * This function should be called when the active VMCS is L1's (vmcs01).
+ */
+void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+ vcpu->arch.efer = vmcs12->host_ia32_efer;
+ if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+ /*
+ * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
+ * actually changed, because it depends on the current state of
+ * fpu_active (which may have changed).
+ * Note that vmx_set_cr0 refers to efer set above.
+ */
+ kvm_set_cr0(vcpu, vmcs12->host_cr0);
+ /*
+ * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
+ * to apply the same changes to L1's vmcs. We just set cr0 correctly,
+ * but we also need to update cr0_guest_host_mask and exception_bitmap.
+ */
+ update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+ /*
+ * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
+ * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
+ */
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ kvm_set_cr4(vcpu, vmcs12->host_cr4);
+
+ /* shadow page tables on either EPT or shadow page tables */
+ kvm_set_cr3(vcpu, vmcs12->host_cr3);
+ kvm_mmu_reset_context(vcpu);
+
+ if (enable_vpid) {
+ /*
+ * Trivially support vpid by letting L2s share their parent
+ * L1's vpid. TODO: move to a more elaborate solution, giving
+ * each L2 its own vpid and exposing the vpid feature to L1.
+ */
+ vmx_flush_tlb(vcpu);
+ }
+
+
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
+ vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
+ vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
+ vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
+ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
+ vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
+ vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
+ vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
+ vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
+ vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl);
+}
+
+/*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+ * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
+ */
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ leave_guest_mode(vcpu);
+ prepare_vmcs12(vcpu, vmcs12);
+
+ cpu = get_cpu();
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx_vcpu_put(vcpu);
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+ put_cpu();
+
+ /* if no vmcs02 cache requested, remove the one we used */
+ if (VMCS02_POOL_SIZE == 0)
+ nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
+
+ load_vmcs12_host_state(vcpu, vmcs12);
+
+ /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+
+ /* This is needed for same reason as it was needed in prepare_vmcs02 */
+ vmx->host_rsp = 0;
+
+ /* Unpin physical memory we referred to in vmcs02 */
+ if (vmx->nested.apic_access_page) {
+ nested_release_page(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = 0;
+ }
+
+ /*
+ * Exiting from L2 to L1, we're now back to L1 which thinks it just
+ * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
+ * success or failure flag accordingly.
+ */
+ if (unlikely(vmx->fail)) {
+ vmx->fail = 0;
+ nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+ } else
+ nested_vmx_succeed(vcpu);
+}
+
+/*
+ * L1's failure to enter L2 is a subset of a normal exit, as explained in
+ * 23.7 "VM-entry failures during or after loading guest state" (this also
+ * lists the acceptable exit-reason and exit-qualification parameters).
+ * It should only be called before L2 actually succeeded to run, and when
+ * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
+ */
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 reason, unsigned long qualification)
+{
+ load_vmcs12_host_state(vcpu, vmcs12);
+ vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+ vmcs12->exit_qualification = qualification;
+ nested_vmx_succeed(vcpu);
+}
+
+static int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return X86EMUL_CONTINUE;
}
static struct kvm_x86_ops vmx_x86_ops = {
@@ -4449,10 +7051,14 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+ .set_tsc_khz = vmx_set_tsc_khz,
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset = vmx_adjust_tsc_offset,
+ .compute_tsc_offset = vmx_compute_tsc_offset,
.set_tdp_cr3 = vmx_set_cr3,
+
+ .check_intercept = vmx_check_intercept,
};
static int __init vmx_init(void)
@@ -4513,16 +7119,13 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
if (enable_ept) {
- bypass_guest_pf = 0;
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
+ ept_set_mmio_spte_mask();
kvm_enable_tdp();
} else
kvm_disable_tdp();
- if (bypass_guest_pf)
- kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
-
return 0;
out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 934b4c6b0bf..84a28ea45fa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -60,22 +60,12 @@
#include <asm/div64.h>
#define MAX_IO_MSRS 256
-#define CR0_RESERVED_BITS \
- (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
- | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
- | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
-#define CR4_RESERVED_BITS \
- (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
- | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
- | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXSAVE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
-
-#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-
#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+#define emul_to_vcpu(ctxt) \
+ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
@@ -100,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+bool kvm_has_tsc_control;
+EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
+u32 kvm_max_guest_tsc_khz;
+EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
@@ -157,6 +152,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0;
+int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
int i;
@@ -350,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
vcpu->arch.cr2 = fault->address;
kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
+EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
@@ -361,8 +359,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_NMI, vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -582,6 +580,22 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_XSAVE));
}
+static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_SMEP));
+}
+
+static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
+}
+
static void update_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -601,14 +615,20 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
-
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
+ X86_CR4_PAE | X86_CR4_SMEP;
if (cr4 & CR4_RESERVED_BITS)
return 1;
if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
return 1;
+ if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
+ return 1;
+
+ if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
+ return 1;
+
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
@@ -618,11 +638,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
kvm_read_cr3(vcpu)))
return 1;
- if (cr4 & X86_CR4_VMXE)
+ if (kvm_x86_ops->set_cr4(vcpu, cr4))
return 1;
- kvm_x86_ops->set_cr4(vcpu, cr4);
-
if ((cr4 ^ old_cr4) & pdptr_bits)
kvm_mmu_reset_context(vcpu);
@@ -790,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 8
+#define KVM_SAVE_MSRS_BEGIN 9
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+ HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -982,7 +1000,15 @@ static inline int kvm_tsc_changes_freq(void)
return ret;
}
-static inline u64 nsec_to_cycles(u64 nsec)
+static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.virtual_tsc_khz)
+ return vcpu->arch.virtual_tsc_khz;
+ else
+ return __this_cpu_read(cpu_tsc_khz);
+}
+
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
u64 ret;
@@ -990,25 +1016,24 @@ static inline u64 nsec_to_cycles(u64 nsec)
if (kvm_tsc_changes_freq())
printk_once(KERN_WARNING
"kvm: unreliable cycle conversion on adjustable rate TSC\n");
- ret = nsec * __this_cpu_read(cpu_tsc_khz);
+ ret = nsec * vcpu_tsc_khz(vcpu);
do_div(ret, USEC_PER_SEC);
return ret;
}
-static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
+static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
- &kvm->arch.virtual_tsc_shift,
- &kvm->arch.virtual_tsc_mult);
- kvm->arch.virtual_tsc_khz = this_tsc_khz;
+ &vcpu->arch.tsc_catchup_shift,
+ &vcpu->arch.tsc_catchup_mult);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
- vcpu->kvm->arch.virtual_tsc_mult,
- vcpu->kvm->arch.virtual_tsc_shift);
+ vcpu->arch.tsc_catchup_mult,
+ vcpu->arch.tsc_catchup_shift);
tsc += vcpu->arch.last_tsc_write;
return tsc;
}
@@ -1021,7 +1046,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
s64 sdiff;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = data - native_read_tsc();
+ offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
sdiff = data - kvm->arch.last_tsc_write;
@@ -1037,13 +1062,13 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
* In that case, for a reliable TSC, we can match TSC offsets,
* or make a best guest using elapsed value.
*/
- if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
+ if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
elapsed < 5ULL * NSEC_PER_SEC) {
if (!check_tsc_unstable()) {
offset = kvm->arch.last_tsc_offset;
pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
- u64 delta = nsec_to_cycles(elapsed);
+ u64 delta = nsec_to_cycles(vcpu, elapsed);
offset += delta;
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
@@ -1075,8 +1100,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
kernel_ns = get_kernel_ns();
- this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-
+ this_tsc_khz = vcpu_tsc_khz(v);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1385,7 +1409,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 1;
kvm_x86_ops->patch_hypercall(vcpu, instructions);
((unsigned char *)instructions)[3] = 0xc3; /* ret */
- if (copy_to_user((void __user *)addr, instructions, 4))
+ if (__copy_to_user((void __user *)addr, instructions, 4))
return 1;
kvm->arch.hv_hypercall = data;
break;
@@ -1412,7 +1436,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
if (kvm_is_error_hva(addr))
return 1;
- if (clear_user((void __user *)addr, PAGE_SIZE))
+ if (__clear_user((void __user *)addr, PAGE_SIZE))
return 1;
vcpu->arch.hv_vapic = data;
break;
@@ -1464,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
}
}
+static void accumulate_steal_time(struct kvm_vcpu *vcpu)
+{
+ u64 delta;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ vcpu->arch.st.accum_steal = delta;
+}
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+ return;
+
+ vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
+ vcpu->arch.st.steal.version += 2;
+ vcpu->arch.st.accum_steal = 0;
+
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@@ -1546,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (kvm_pv_enable_async_pf(vcpu, data))
return 1;
break;
+ case MSR_KVM_STEAL_TIME:
+
+ if (unlikely(!sched_info_on()))
+ return 1;
+
+ if (data & KVM_STEAL_RESERVED_MASK)
+ return 1;
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+ data & KVM_STEAL_VALID_BITS))
+ return 1;
+
+ vcpu->arch.st.msr_val = data;
+
+ if (!(data & KVM_MSR_ENABLED))
+ break;
+
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+ preempt_disable();
+ accumulate_steal_time(vcpu);
+ preempt_enable();
+
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+
+ break;
+
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1831,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_ASYNC_PF_EN:
data = vcpu->arch.apf.msr_val;
break;
+ case MSR_KVM_STEAL_TIME:
+ data = vcpu->arch.st.msr_val;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -1993,6 +2076,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_GET_TSC_KHZ:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2019,6 +2103,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XCRS:
r = cpu_has_xsave;
break;
+ case KVM_CAP_TSC_CONTROL:
+ r = kvm_has_tsc_control;
+ break;
default:
r = 0;
break;
@@ -2120,8 +2207,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->vcpu_load(vcpu, cpu);
if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
/* Make sure TSC doesn't go backwards */
- s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
- native_read_tsc() - vcpu->arch.last_host_tsc;
+ s64 tsc_delta;
+ u64 tsc;
+
+ kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
+ tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
+ tsc - vcpu->arch.last_guest_tsc;
+
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
@@ -2133,13 +2225,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_migrate_timers(vcpu);
vcpu->cpu = cpu;
}
+
+ accumulate_steal_time(vcpu);
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
- vcpu->arch.last_host_tsc = native_read_tsc();
+ kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
}
static int is_efer_nx(void)
@@ -2271,6 +2366,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags = 0;
}
+static bool supported_xcr0_bit(unsigned bit)
+{
+ u64 mask = ((u64)1 << bit);
+
+ return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
+}
+
#define F(x) bit(X86_FEATURE_##x)
static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
@@ -2316,7 +2418,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
- F(F16C);
+ F(F16C) | F(RDRAND);
/* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
@@ -2324,6 +2426,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+ /* cpuid 0xC0000001.edx */
+ const u32 kvm_supported_word5_x86_features =
+ F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+ F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+ F(PMM) | F(PMM_EN);
+
+ /* cpuid 7.0.ebx */
+ const u32 kvm_supported_word9_x86_features =
+ F(SMEP) | F(FSGSBASE) | F(ERMS);
+
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
do_cpuid_1_ent(entry, function, index);
@@ -2358,7 +2470,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
- /* function 4 and 0xb have additional index. */
+ /* function 4 has additional index. */
case 4: {
int i, cache_type;
@@ -2375,6 +2487,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
+ case 7: {
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* Mask ebx against host capbability word 9 */
+ if (index == 0) {
+ entry->ebx &= kvm_supported_word9_x86_features;
+ cpuid_mask(&entry->ebx, 9);
+ } else
+ entry->ebx = 0;
+ entry->eax = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ }
+ case 9:
+ break;
+ /* function 0xb has additional index. */
case 0xb: {
int i, level_type;
@@ -2392,16 +2520,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
}
case 0xd: {
- int i;
+ int idx, i;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- for (i = 1; *nent < maxnent && i < 64; ++i) {
- if (entry[i].eax == 0)
+ for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
+ do_cpuid_1_ent(&entry[i], function, idx);
+ if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
continue;
- do_cpuid_1_ent(&entry[i], function, i);
entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
++*nent;
+ ++i;
}
break;
}
@@ -2418,7 +2547,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
(1 << KVM_FEATURE_NOP_IO_DELAY) |
(1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_ASYNC_PF) |
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
+ if (sched_info_on())
+ entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
entry->ebx = 0;
entry->ecx = 0;
entry->edx = 0;
@@ -2432,6 +2566,44 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_supported_word6_x86_features;
cpuid_mask(&entry->ecx, 6);
break;
+ case 0x80000008: {
+ unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+ unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+ unsigned phys_as = entry->eax & 0xff;
+
+ if (!g_phys_as)
+ g_phys_as = phys_as;
+ entry->eax = g_phys_as | (virt_as << 8);
+ entry->ebx = entry->edx = 0;
+ break;
+ }
+ case 0x80000019:
+ entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001a:
+ break;
+ case 0x8000001d:
+ break;
+ /*Add support for Centaur's CPUID instruction*/
+ case 0xC0000000:
+ /*Just support up to 0xC0000004 now*/
+ entry->eax = min(entry->eax, 0xC0000004);
+ break;
+ case 0xC0000001:
+ entry->edx &= kvm_supported_word5_x86_features;
+ cpuid_mask(&entry->edx, 5);
+ break;
+ case 3: /* Processor serial number */
+ case 5: /* MONITOR/MWAIT */
+ case 6: /* Thermal management */
+ case 0xA: /* Architectural Performance Monitoring */
+ case 0x80000007: /* Advanced power management */
+ case 0xC0000002:
+ case 0xC0000003:
+ case 0xC0000004:
+ default:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
}
kvm_x86_ops->set_supported_cpuid(function, entry);
@@ -2478,6 +2650,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
if (nent >= cpuid->nent)
goto out_free;
+ /* Add support for Centaur's CPUID instruction. */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
+ do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
+ &nent, cpuid->nent);
+
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ limit = cpuid_entries[nent - 1].eax;
+ for (func = 0xC0000001;
+ func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
+
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+ }
+
do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
cpuid->nent);
@@ -3046,6 +3238,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
break;
}
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ if (!kvm_has_tsc_control)
+ break;
+
+ user_tsc_khz = (u32)arg;
+
+ if (user_tsc_khz >= kvm_max_guest_tsc_khz)
+ goto out;
+
+ kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+
+ r = 0;
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = -EIO;
+ if (check_tsc_unstable())
+ goto out;
+
+ r = vcpu_tsc_khz(vcpu);
+
+ goto out;
+ }
default:
r = -EINVAL;
}
@@ -3595,20 +3813,43 @@ static void kvm_init_msr_list(void)
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
const void *v)
{
- if (vcpu->arch.apic &&
- !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
- return 0;
+ int handled = 0;
+ int n;
+
+ do {
+ n = min(len, 8);
+ if (!(vcpu->arch.apic &&
+ !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+ break;
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
- return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+ return handled;
}
static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
{
- if (vcpu->arch.apic &&
- !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
- return 0;
+ int handled = 0;
+ int n;
- return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+ do {
+ n = min(len, 8);
+ if (!(vcpu->arch.apic &&
+ !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+ break;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
}
static void kvm_set_segment(struct kvm_vcpu *vcpu,
@@ -3703,37 +3944,44 @@ out:
}
/* used for instruction fetching */
-static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
+static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
access | PFERR_FETCH_MASK,
exception);
}
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
+int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
exception);
}
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
-static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu,
+static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
-static int kvm_write_guest_virt_system(gva_t addr, void *val,
+int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val,
unsigned int bytes,
- struct kvm_vcpu *vcpu,
struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
void *data = val;
int r = X86EMUL_CONTINUE;
@@ -3760,14 +4008,52 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
out:
return r;
}
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t *gpa, struct x86_exception *exception,
+ bool write)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+ if (vcpu_match_mmio_gva(vcpu, gva) &&
+ check_write_user_access(vcpu, write, access,
+ vcpu->arch.access)) {
+ *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+ (gva & (PAGE_SIZE - 1));
+ trace_vcpu_match_mmio(gva, *gpa, write, false);
+ return 1;
+ }
+
+ if (write)
+ access |= PFERR_WRITE_MASK;
+
+ *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+
+ if (*gpa == UNMAPPED_GVA)
+ return -1;
+
+ /* For APIC access vmexit */
+ if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ return 1;
+
+ if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
+ trace_vcpu_match_mmio(gva, *gpa, write, true);
+ return 1;
+ }
+
+ return 0;
+}
-static int emulator_read_emulated(unsigned long addr,
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
void *val,
unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu)
+ struct x86_exception *exception)
{
- gpa_t gpa;
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ gpa_t gpa;
+ int handled, ret;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3777,16 +4063,15 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
- if (gpa == UNMAPPED_GVA)
+ if (ret < 0)
return X86EMUL_PROPAGATE_FAULT;
- /* For APIC access vmexit */
- if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ if (ret)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
+ if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
== X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
@@ -3794,18 +4079,24 @@ mmio:
/*
* Is this MMIO handled locally?
*/
- if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
- trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
+ handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
+
+ if (handled == bytes)
return X86EMUL_CONTINUE;
- }
+
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->mmio_size = bytes;
+ vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
+ vcpu->mmio_index = 0;
return X86EMUL_IO_NEEDED;
}
@@ -3828,15 +4119,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
- gpa_t gpa;
+ gpa_t gpa;
+ int handled, ret;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
- if (gpa == UNMAPPED_GVA)
+ if (ret < 0)
return X86EMUL_PROPAGATE_FAULT;
/* For APIC access vmexit */
- if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ if (ret)
goto mmio;
if (emulator_write_phys(vcpu, gpa, val, bytes))
@@ -3847,25 +4139,35 @@ mmio:
/*
* Is this MMIO handled locally?
*/
- if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+ handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
+ if (handled == bytes)
return X86EMUL_CONTINUE;
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
+
vcpu->mmio_needed = 1;
+ memcpy(vcpu->mmio_data, val, bytes);
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->mmio_size = bytes;
+ vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
- memcpy(vcpu->run->mmio.data, val, bytes);
+ memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+ vcpu->mmio_index = 0;
return X86EMUL_CONTINUE;
}
-int emulator_write_emulated(unsigned long addr,
+int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
const void *val,
unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu)
+ struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
int rc, now;
@@ -3893,13 +4195,14 @@ int emulator_write_emulated(unsigned long addr,
(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
#endif
-static int emulator_cmpxchg_emulated(unsigned long addr,
+static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu)
+ struct x86_exception *exception)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
struct page *page;
char *kaddr;
@@ -3955,7 +4258,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
- return emulator_write_emulated(addr, new, bytes, exception, vcpu);
+ return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3974,9 +4277,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
}
-static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
- unsigned int count, struct kvm_vcpu *vcpu)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
if (vcpu->arch.pio.count)
goto data_avail;
@@ -4004,10 +4310,12 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
return 0;
}
-static int emulator_pio_out_emulated(int size, unsigned short port,
- const void *val, unsigned int count,
- struct kvm_vcpu *vcpu)
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port,
+ const void *val, unsigned int count)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
trace_kvm_pio(1, port, size, count);
vcpu->arch.pio.port = port;
@@ -4037,10 +4345,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
return kvm_x86_ops->get_segment_base(vcpu, seg);
}
-int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
+static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
{
- kvm_mmu_invlpg(vcpu, address);
- return X86EMUL_CONTINUE;
+ kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
}
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
@@ -4062,22 +4369,20 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
-int emulate_clts(struct kvm_vcpu *vcpu)
+static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
{
- kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- kvm_x86_ops->fpu_activate(vcpu);
- return X86EMUL_CONTINUE;
+ kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
}
-int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
+int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- return _kvm_get_dr(vcpu, dr, dest);
+ return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
}
-int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
+int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
- return __kvm_set_dr(vcpu, dr, value);
+ return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -4085,8 +4390,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val)
return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}
-static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
+static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
unsigned long value;
switch (cr) {
@@ -4113,8 +4419,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
return value;
}
-static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int res = 0;
switch (cr) {
@@ -4141,33 +4448,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
return res;
}
-static int emulator_get_cpl(struct kvm_vcpu *vcpu)
+static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
- return kvm_x86_ops->get_cpl(vcpu);
+ return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
}
-static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->get_gdt(vcpu, dt);
+ kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
}
-static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->get_idt(vcpu, dt);
+ kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
}
-static unsigned long emulator_get_cached_segment_base(int seg,
- struct kvm_vcpu *vcpu)
+static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- return get_segment_base(vcpu, seg);
+ kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
}
-static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
- int seg, struct kvm_vcpu *vcpu)
+static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
+}
+
+static unsigned long emulator_get_cached_segment_base(
+ struct x86_emulate_ctxt *ctxt, int seg)
+{
+ return get_segment_base(emul_to_vcpu(ctxt), seg);
+}
+
+static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3,
+ int seg)
{
struct kvm_segment var;
- kvm_get_segment(vcpu, &var, seg);
+ kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
+ *selector = var.selector;
if (var.unusable)
return false;
@@ -4192,14 +4511,14 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
return true;
}
-static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
- int seg, struct kvm_vcpu *vcpu)
+static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3,
+ int seg)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
struct kvm_segment var;
- /* needed to preserve selector */
- kvm_get_segment(vcpu, &var, seg);
-
+ var.selector = selector;
var.base = get_desc_base(desc);
#ifdef CONFIG_X86_64
var.base |= ((u64)base3) << 32;
@@ -4223,22 +4542,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
return;
}
-static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
+static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
+{
+ return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+}
+
+static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
{
- struct kvm_segment kvm_seg;
+ return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+}
- kvm_get_segment(vcpu, &kvm_seg, seg);
- return kvm_seg.selector;
+static void emulator_halt(struct x86_emulate_ctxt *ctxt)
+{
+ emul_to_vcpu(ctxt)->arch.halt_request = 1;
}
-static void emulator_set_segment_selector(u16 sel, int seg,
- struct kvm_vcpu *vcpu)
+static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
{
- struct kvm_segment kvm_seg;
+ preempt_disable();
+ kvm_load_guest_fpu(emul_to_vcpu(ctxt));
+ /*
+ * CR0.TS may reference the host fpu state, not the guest fpu state,
+ * so it may be clear at this point.
+ */
+ clts();
+}
- kvm_get_segment(vcpu, &kvm_seg, seg);
- kvm_seg.selector = sel;
- kvm_set_segment(vcpu, &kvm_seg, seg);
+static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
+{
+ preempt_enable();
+}
+
+static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
}
static struct x86_emulate_ops emulate_ops = {
@@ -4248,22 +4589,29 @@ static struct x86_emulate_ops emulate_ops = {
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
+ .invlpg = emulator_invlpg,
.pio_in_emulated = emulator_pio_in_emulated,
.pio_out_emulated = emulator_pio_out_emulated,
- .get_cached_descriptor = emulator_get_cached_descriptor,
- .set_cached_descriptor = emulator_set_cached_descriptor,
- .get_segment_selector = emulator_get_segment_selector,
- .set_segment_selector = emulator_set_segment_selector,
+ .get_segment = emulator_get_segment,
+ .set_segment = emulator_set_segment,
.get_cached_segment_base = emulator_get_cached_segment_base,
.get_gdt = emulator_get_gdt,
.get_idt = emulator_get_idt,
+ .set_gdt = emulator_set_gdt,
+ .set_idt = emulator_set_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
- .set_msr = kvm_set_msr,
- .get_msr = kvm_get_msr,
+ .set_msr = emulator_set_msr,
+ .get_msr = emulator_get_msr,
+ .halt = emulator_halt,
+ .wbinvd = emulator_wbinvd,
+ .fix_hypercall = emulator_fix_hypercall,
+ .get_fpu = emulator_get_fpu,
+ .put_fpu = emulator_put_fpu,
+ .intercept = emulator_intercept,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -4300,47 +4648,68 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
kvm_queue_exception(vcpu, ctxt->exception.vector);
}
+static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
+ const unsigned long *regs)
+{
+ memset(&ctxt->twobyte, 0,
+ (void *)&ctxt->regs - (void *)&ctxt->twobyte);
+ memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
+
+ ctxt->fetch.start = 0;
+ ctxt->fetch.end = 0;
+ ctxt->io_read.pos = 0;
+ ctxt->io_read.end = 0;
+ ctxt->mem_read.pos = 0;
+ ctxt->mem_read.end = 0;
+}
+
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
{
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
+ /*
+ * TODO: fix emulate.c to use guest_read/write_register
+ * instead of direct ->regs accesses, can save hundred cycles
+ * on Intel for instructions that don't read/change RSP, for
+ * for example.
+ */
cache_all_regs(vcpu);
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- vcpu->arch.emulate_ctxt.vcpu = vcpu;
- vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
- vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
- vcpu->arch.emulate_ctxt.mode =
- (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
- (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
- ? X86EMUL_MODE_VM86 : cs_l
- ? X86EMUL_MODE_PROT64 : cs_db
- ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- memset(c, 0, sizeof(struct decode_cache));
- memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+ ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->eip = kvm_rip_read(vcpu);
+ ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
+ cs_l ? X86EMUL_MODE_PROT64 :
+ cs_db ? X86EMUL_MODE_PROT32 :
+ X86EMUL_MODE_PROT16;
+ ctxt->guest_mode = is_guest_mode(vcpu);
+
+ init_decode_cache(ctxt, vcpu->arch.regs);
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
{
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int ret;
init_emulate_ctxt(vcpu);
- vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
- vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
- vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
- ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
+ ctxt->op_bytes = 2;
+ ctxt->ad_bytes = 2;
+ ctxt->_eip = ctxt->eip + inc_eip;
+ ret = emulate_int_real(ctxt, irq);
if (ret != X86EMUL_CONTINUE)
return EMULATE_FAIL;
- vcpu->arch.emulate_ctxt.eip = c->eip;
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ ctxt->eip = ctxt->_eip;
+ memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
if (irq == NMI_VECTOR)
vcpu->arch.nmi_pending = false;
@@ -4401,28 +4770,21 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
int insn_len)
{
int r;
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ bool writeback = true;
kvm_clear_exception_queue(vcpu);
- vcpu->arch.mmio_fault_cr2 = cr2;
- /*
- * TODO: fix emulate.c to use guest_read/write_register
- * instead of direct ->regs accesses, can save hundred cycles
- * on Intel for instructions that don't read/change RSP, for
- * for example.
- */
- cache_all_regs(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
- vcpu->arch.emulate_ctxt.interruptibility = 0;
- vcpu->arch.emulate_ctxt.have_exception = false;
- vcpu->arch.emulate_ctxt.perm_ok = false;
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->perm_ok = false;
- vcpu->arch.emulate_ctxt.only_vendor_specific_insn
+ ctxt->only_vendor_specific_insn
= emulation_type & EMULTYPE_TRAP_UD;
- r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
+ r = x86_decode_insn(ctxt, insn, insn_len);
trace_kvm_emulate_insn_start(vcpu);
++vcpu->stat.insn_emulation;
@@ -4438,16 +4800,22 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
}
if (emulation_type & EMULTYPE_SKIP) {
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+ kvm_rip_write(vcpu, ctxt->_eip);
return EMULATE_DONE;
}
- /* this is needed for vmware backdor interface to work since it
+ /* this is needed for vmware backdoor interface to work since it
changes registers values during IO operation */
- memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+ memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
+ }
restart:
- r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
+ r = x86_emulate_insn(ctxt);
+
+ if (r == EMULATION_INTERCEPTED)
+ return EMULATE_DONE;
if (r == EMULATION_FAILED) {
if (reexecute_instruction(vcpu, cr2))
@@ -4456,27 +4824,33 @@ restart:
return handle_emulation_failure(vcpu);
}
- if (vcpu->arch.emulate_ctxt.have_exception) {
+ if (ctxt->have_exception) {
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in)
vcpu->arch.pio.count = 0;
+ else
+ writeback = false;
r = EMULATE_DO_MMIO;
} else if (vcpu->mmio_needed) {
- if (vcpu->mmio_is_write)
- vcpu->mmio_needed = 0;
+ if (!vcpu->mmio_is_write)
+ writeback = false;
r = EMULATE_DO_MMIO;
} else if (r == EMULATION_RESTART)
goto restart;
else
r = EMULATE_DONE;
- toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+ if (writeback) {
+ toggle_interruptibility(vcpu, ctxt->interruptibility);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ kvm_rip_write(vcpu, ctxt->eip);
+ } else
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
return r;
}
@@ -4485,7 +4859,8 @@ EXPORT_SYMBOL_GPL(x86_emulate_instruction);
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
{
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
+ int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
+ size, port, &val, 1);
/* do not return to emulator after return from userspace */
vcpu->arch.pio.count = 0;
return ret;
@@ -4690,6 +5065,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
+static void kvm_set_mmio_spte_mask(void)
+{
+ u64 mask;
+ int maxphyaddr = boot_cpu_data.x86_phys_bits;
+
+ /*
+ * Set the reserved bits and the present bit of an paging-structure
+ * entry to generate page fault with PFER.RSV = 1.
+ */
+ mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+ mask |= 1ull;
+
+#ifdef CONFIG_X86_64
+ /*
+ * If reserved bit is not supported, clear the present bit to disable
+ * mmio page fault.
+ */
+ if (maxphyaddr == 52)
+ mask &= ~1ull;
+#endif
+
+ kvm_mmu_set_mmio_spte_mask(mask);
+}
+
int kvm_arch_init(void *opaque)
{
int r;
@@ -4716,10 +5115,10 @@ int kvm_arch_init(void *opaque)
if (r)
goto out;
+ kvm_set_mmio_spte_mask();
kvm_init_msr_list();
kvm_x86_ops = ops;
- kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -4879,8 +5278,9 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
-int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
+int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
@@ -4893,21 +5293,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
-}
-
-void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
- struct desc_ptr dt = { limit, base };
-
- kvm_x86_ops->set_gdt(vcpu, &dt);
-}
-
-void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-{
- struct desc_ptr dt = { limit, base };
-
- kvm_x86_ops->set_idt(vcpu, &dt);
+ return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
}
static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5170,6 +5556,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
+ bool nmi_pending;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
@@ -5207,19 +5594,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 1;
goto out;
}
- if (kvm_check_request(KVM_REQ_NMI, vcpu))
- vcpu->arch.nmi_pending = true;
+ if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+ record_steal_time(vcpu);
+
}
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
goto out;
+ /*
+ * An NMI can be injected between local nmi_pending read and
+ * vcpu->arch.nmi_pending read inside inject_pending_event().
+ * But in that case, KVM_REQ_EVENT will be set, which makes
+ * the race described above benign.
+ */
+ nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
+
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
inject_pending_event(vcpu);
/* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
+ if (nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
@@ -5399,6 +5795,41 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
return r;
}
+static int complete_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ int r;
+
+ if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
+ return 1;
+
+ if (vcpu->mmio_needed) {
+ vcpu->mmio_needed = 0;
+ if (!vcpu->mmio_is_write)
+ memcpy(vcpu->mmio_data + vcpu->mmio_index,
+ run->mmio.data, 8);
+ vcpu->mmio_index += 8;
+ if (vcpu->mmio_index < vcpu->mmio_size) {
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
+ memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
+ run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->mmio_needed = 1;
+ return 0;
+ }
+ if (vcpu->mmio_is_write)
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ }
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r != EMULATE_DONE)
+ return 0;
+ return 1;
+}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
@@ -5425,20 +5856,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
}
- if (vcpu->arch.pio.count || vcpu->mmio_needed) {
- if (vcpu->mmio_needed) {
- memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
- vcpu->mmio_read_completed = 1;
- vcpu->mmio_needed = 0;
- }
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE) {
- r = 0;
- goto out;
- }
- }
+ r = complete_mmio(vcpu);
+ if (r <= 0)
+ goto out;
+
if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
kvm_register_write(vcpu, VCPU_REGS_RAX,
kvm_run->hypercall.ret);
@@ -5455,6 +5876,18 @@ out:
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
+ /*
+ * We are here if userspace calls get_regs() in the middle of
+ * instruction emulation. Registers state needs to be copied
+ * back from emulation context to vcpu. Usrapace shouldn't do
+ * that usually, but some bad designed PV devices (vmware
+ * backdoor interface) need this to work
+ */
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ }
regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -5482,6 +5915,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -5578,21 +6014,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
bool has_error_code, u32 error_code)
{
- struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int ret;
init_emulate_ctxt(vcpu);
- ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
- tss_selector, reason, has_error_code,
- error_code);
+ ret = emulator_task_switch(ctxt, tss_selector, reason,
+ has_error_code, error_code);
if (ret)
return EMULATE_FAIL;
- memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
- kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
return EMULATE_DONE;
}
@@ -5870,12 +6305,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
if (r == 0)
r = kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
- if (r < 0)
- goto free_vcpu;
- return 0;
-free_vcpu:
- kvm_x86_ops->vcpu_free(vcpu);
return r;
}
@@ -5903,6 +6333,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_val = 0;
+ vcpu->arch.st.msr_val = 0;
kvmclock_reset(vcpu);
@@ -5974,8 +6405,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.pio_data = page_address(page);
- if (!kvm->arch.virtual_tsc_khz)
- kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
+ kvm_init_tsc_catchup(vcpu, max_tsc_khz);
r = kvm_mmu_create(vcpu);
if (r < 0)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c600da830ce..d36fe237c66 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,10 +75,54 @@ static inline u32 bit(int bitno)
return 1 << (bitno & 31);
}
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+ gva_t gva, gfn_t gfn, unsigned access)
+{
+ vcpu->arch.mmio_gva = gva & PAGE_MASK;
+ vcpu->arch.access = access;
+ vcpu->arch.mmio_gfn = gfn;
+}
+
+/*
+ * Clear the mmio cache info for the given gva,
+ * specially, if gva is ~0ul, we clear all mmio cache info.
+ */
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ return;
+
+ vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+ if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ return true;
+
+ return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ return true;
+
+ return false;
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
+int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 1cd608973ce..13ee258442a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -7,7 +7,7 @@
* kernel and insert a module (lg.ko) which allows us to run other Linux
* kernels the same way we'd run processes. We call the first kernel the Host,
* and the others the Guests. The program which sets up and configures Guests
- * (such as the example in Documentation/lguest/lguest.c) is called the
+ * (such as the example in Documentation/virtual/lguest/lguest.c) is called the
* Launcher.
*
* Secondly, we only run specially modified Guests, not normal kernels: setting
@@ -71,7 +71,8 @@
#include <asm/stackprotector.h>
#include <asm/reboot.h> /* for struct machine_ops */
-/*G:010 Welcome to the Guest!
+/*G:010
+ * Welcome to the Guest!
*
* The Guest in our tale is a simple creature: identical to the Host but
* behaving in simplified but equivalent ways. In particular, the Guest is the
@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call,
#endif
/*G:036
- * When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue the do-nothing hypercall to flush any stored calls.
-:*/
+ * When lazy mode is turned off, we issue the do-nothing hypercall to
+ * flush any stored calls, and call the generic helper to reset the
+ * per-cpu lazy mode variable.
+ */
static void lguest_leave_lazy_mmu_mode(void)
{
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
paravirt_leave_lazy_mmu();
}
+/*
+ * We also catch the end of context switch; we enter lazy mode for much of
+ * that too, so again we need to flush here.
+ *
+ * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
+ * mode, but unlike Xen, lguest doesn't care about the difference).
+ */
static void lguest_end_context_switch(struct task_struct *next)
{
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void)
* giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
*
* This instruction even it has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 5 languages. I am not making this up!
+ * has been translated into 6 languages. I am not making this up!
*
* We could get funky here and identify ourselves as "GenuineLguest", but
* instead we just use the real "cpuid" instruction. Then I pretty much turned
@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
/*
* PAE systems can mark pages as non-executable. Linux calls this the
* NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
- * Virus Protection). We just switch turn if off here, since we don't
+ * Virus Protection). We just switch it off here, since we don't
* support it.
*/
case 0x80000001:
@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void)
/* See lguest_set_pte() below. */
static bool cr3_changed = false;
+static unsigned long current_cr3;
/*
* cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes. The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
+ * cr0. Keep a local copy, and tell the Host when it changes.
*/
static void lguest_write_cr3(unsigned long cr3)
{
- lguest_data.pgdir = cr3;
lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
+ current_cr3 = cr3;
/* These two page tables are simple, linear, and used during boot */
if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3)
static unsigned long lguest_read_cr3(void)
{
- return lguest_data.pgdir;
+ return current_cr3;
}
/* cr4 is used to enable and disable PGE, but we don't care. */
@@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val)
/*
* The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space. Wetell the Host the toplevel and
+ * a page into a process' address space. We tell the Host the toplevel and
* address this corresponds to. The Guest uses one pagetable per process, so
* we need to tell the Host which one we're changing (mm->pgd).
*/
@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
static void lguest_flush_tlb_single(unsigned long addr)
{
/* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+ lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
}
/*
@@ -913,8 +921,6 @@ static struct clocksource lguest_clock = {
.rating = 200,
.read = lguest_clock_read,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << 22,
- .shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -995,9 +1001,10 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
static void lguest_time_init(void)
{
/* Set up the timer interrupt (0) to go to our simple timer routine */
+ lguest_setup_irq(0);
irq_set_handler(0, lguest_time_irq);
- clocksource_register(&lguest_clock);
+ clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
/* We can't set cpumask in the initializer: damn C limitations! Set it
* here and register our timer device. */
@@ -1141,7 +1148,7 @@ static struct notifier_block paniced = {
static __init char *lguest_memory_setup(void)
{
/*
- *The Linux bootloader header contains an "e820" memory map: the
+ * The Linux bootloader header contains an "e820" memory map: the
* Launcher populated the first entry with our memory limit.
*/
e820_add_region(boot_params.e820_map[0].addr,
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 4f420c2f2d5..6ddfe4fc23c 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -6,18 +6,22 @@
#include <asm/processor-flags.h>
/*G:020
- * Our story starts with the kernel booting into startup_32 in
- * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
- * the bootloader (the Launcher in our case).
+
+ * Our story starts with the bzImage: booting starts at startup_32 in
+ * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
+ * kernel in place and then jumps into it: startup_32 in
+ * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi
+ * register, which is created by the bootloader (the Launcher in our case).
*
* The startup_32 function does very little: it clears the uninitialized global
* C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe. Finally it checks the
- * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen:
- * if it's set to '1' (lguest's assigned number), then it calls us here.
+ * header and kernel command line somewhere safe, and populates some initial
+ * page tables. Finally it checks the 'hardware_subarch' field. This was
+ * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
+ * assigned number), then it calls us here.
*
* WARNING: be very careful here! We're running at addresses equal to physical
- * addesses (around 0), not above PAGE_OFFSET as most code expectes
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
* data without remembering to subtract __PAGE_OFFSET!
*
@@ -27,13 +31,18 @@
.section .init.text, "ax", @progbits
ENTRY(lguest_entry)
/*
- * We make the "initialization" hypercall now to tell the Host about
- * us, and also find out where it put our page tables.
+ * We make the "initialization" hypercall now to tell the Host where
+ * our lguest_data struct is.
*/
movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %ebx
int $LGUEST_TRAP_ENTRY
+ /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
+ movl $LHCALL_NEW_PGTABLE, %eax
+ movl $(initial_page_table - __PAGE_OFFSET), %ebx
+ int $LGUEST_TRAP_ENTRY
+
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
@@ -96,12 +105,8 @@ send_interrupts:
*/
pushl %eax
movl $LHCALL_SEND_INTERRUPTS, %eax
- /*
- * This is a vmcall instruction (same thing that KVM uses). Older
- * assembler versions might not know the "vmcall" instruction, so we
- * create one manually here.
- */
- .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+ /* This is the actual hypercall trap. */
+ int $LGUEST_TRAP_ENTRY
/* Put eax back the way we found it. */
popl %eax
ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2479f19ddd..b00f6785da7 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -18,8 +18,10 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o
lib-y += thunk_$(BITS).o
-lib-y += usercopy_$(BITS).o getuser.o putuser.o
+lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
+lib-$(CONFIG_SMP) += rwlock.o
+lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
obj-y += msr.o msr-reg.o msr-reg-export.o
@@ -29,7 +31,7 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += atomic64_cx8_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
- lib-y += semaphore_32.o string_32.o
+ lib-y += string_32.o
lib-y += cmpxchg.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
@@ -40,7 +42,6 @@ else
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
- lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
- lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+ lib-y += copy_user_64.o copy_user_nocache_64.o
lib-y += cmpxchg16b_emu.o
endif
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 540179e8e9f..042f6826bf5 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -4,7 +4,7 @@
#include <asm/processor.h>
#include <asm/cmpxchg.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
long long atomic64_read_cx8(long long, const atomic64_t *v);
EXPORT_SYMBOL(atomic64_read_cx8);
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24..f2145cfa12a 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
CFI_ENDPROC
ENDPROC(clear_page_c)
+ENTRY(clear_page_c_e)
+ CFI_STARTPROC
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
ENTRY(clear_page)
CFI_STARTPROC
xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
.Lclear_page_end:
ENDPROC(clear_page)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use this when possible.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original function.
+ *
+ */
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2:
+2: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
+3:
.previous
.section .altinstructions,"a"
- .align 8
- .quad clear_page
- .quad 1b
- .word X86_FEATURE_REP_GOOD
- .byte .Lclear_page_end - clear_page
- .byte 2b - 1b
+ altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+ .Lclear_page_end-clear_page, 2b-1b
+ altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
+ .Lclear_page_end-clear_page,3b-2b
.previous
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6fec2d1cebe..01c805ba535 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,6 +2,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
ALIGN
copy_page_c:
@@ -110,10 +111,6 @@ ENDPROC(copy_page)
2:
.previous
.section .altinstructions,"a"
- .align 8
- .quad copy_page
- .quad 1b
- .word X86_FEATURE_REP_GOOD
- .byte .Lcopy_page_end - copy_page
- .byte 2b - 1b
+ altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
+ .Lcopy_page_end-copy_page, 2b-1b
.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 99e48261519..024840266ba 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
- .macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+ .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0:
.byte 0xe9 /* 32bit jump */
.long \orig-1f /* by default jump to orig */
1:
.section .altinstr_replacement,"ax"
2: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt-1b /* offset */ /* or alternatively to alt */
+ .long \alt1-1b /* offset */ /* or alternatively to alt1 */
+3: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt2-1b /* offset */ /* or alternatively to alt2 */
.previous
+
.section .altinstructions,"a"
- .align 8
- .quad 0b
- .quad 2b
- .word \feature /* when feature is set */
- .byte 5
- .byte 5
+ altinstruction_entry 0b,2b,\feature1,5,5
+ altinstruction_entry 0b,3b,\feature2,5,5
.previous
.endm
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
addq %rdx,%rcx
jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx
- jae bad_to_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ ja bad_to_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_to_user)
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
addq %rdx,%rcx
jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx
- jae bad_from_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ ja bad_from_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_from_user)
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
.previous
CFI_ENDPROC
ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+ CFI_STARTPROC
+ andl %edx,%edx
+ jz 2f
+ movl %edx,%ecx
+1: rep
+ movsb
+2: xorl %eax,%eax
+ ret
+
+ .section .fixup,"ax"
+12: movl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
+
+ .section __ex_table,"a"
+ .align 8
+ .quad 1b,12b
+ .previous
+ CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e35e3..efbf2a0ecde 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* memcpy - Copy a memory block.
@@ -37,6 +38,23 @@
.Lmemcpy_e:
.previous
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+ movq %rdi, %rax
+
+ movl %edx, %ecx
+ rep movsb
+ ret
+.Lmemcpy_e_e:
+ .previous
+
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
@@ -49,7 +67,7 @@ ENTRY(memcpy)
jb .Lhandle_tail
/*
- * We check whether memory false dependece could occur,
+ * We check whether memory false dependence could occur,
* then jump to corresponding copy mode.
*/
cmp %dil, %sil
@@ -171,21 +189,22 @@ ENDPROC(memcpy)
ENDPROC(__memcpy)
/*
- * Some CPUs run faster using the string copy instructions.
- * It is also a lot simpler. Use this when possible:
- */
-
- .section .altinstructions, "a"
- .align 8
- .quad memcpy
- .quad .Lmemcpy_c
- .word X86_FEATURE_REP_GOOD
-
- /*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+ * If the feature is supported, memcpy_c_e() is the first choice.
+ * If enhanced rep movsb copy is not available, use fast string copy
+ * memcpy_c() when possible. This is faster and code is simpler than
+ * original memcpy().
+ * Otherwise, original memcpy() is used.
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ *
* Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
- .byte .Lmemcpy_e - .Lmemcpy_c
- .byte .Lmemcpy_e - .Lmemcpy_c
+ .section .altinstructions, "a"
+ altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+ .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+ altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+ .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0ecb8433e5a..ee164610ec4 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,8 @@
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
#undef memmove
@@ -24,6 +26,7 @@
*/
ENTRY(memmove)
CFI_STARTPROC
+
/* Handle more 32bytes in loop */
mov %rdi, %rax
cmp $0x20, %rdx
@@ -31,8 +34,13 @@ ENTRY(memmove)
/* Decide forward/backward copy mode */
cmp %rdi, %rsi
- jb 2f
+ jge .Lmemmove_begin_forward
+ mov %rsi, %r8
+ add %rdx, %r8
+ cmp %rdi, %r8
+ jg 2f
+.Lmemmove_begin_forward:
/*
* movsq instruction have many startup latency
* so we handle small size by general register.
@@ -78,6 +86,8 @@ ENTRY(memmove)
rep movsq
movq %r11, (%r10)
jmp 13f
+.Lmemmove_end_forward:
+
/*
* Handle data backward by movsq.
*/
@@ -194,4 +204,20 @@ ENTRY(memmove)
13:
retq
CFI_ENDPROC
+
+ .section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+ /* Forward moving data. */
+ movq %rdx, %rcx
+ rep movsb
+ retq
+.Lmemmove_end_forward_efs:
+ .previous
+
+ .section .altinstructions,"a"
+ altinstruction_entry .Lmemmove_begin_forward, \
+ .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
+ .Lmemmove_end_forward-.Lmemmove_begin_forward, \
+ .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+ .previous
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d34426965..79bd454b78a 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
/*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the orignal function as well.
*
* rdi destination
* rsi value (char)
@@ -31,6 +35,28 @@
.Lmemset_e:
.previous
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+ movq %rdi,%r9
+ movb %sil,%al
+ movl %edx,%ecx
+ rep stosb
+ movq %r9,%rax
+ ret
+.Lmemset_e_e:
+ .previous
+
ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
ENDPROC(memset)
ENDPROC(__memset)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+ /* Some CPUs support enhanced REP MOVSB/STOSB feature.
+ * It is recommended to use this when possible.
+ *
+ * If enhanced REP MOVSB/STOSB feature is not available, use fast string
+ * instructions.
+ *
+ * Otherwise, use original memset function.
+ *
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ */
.section .altinstructions,"a"
- .align 8
- .quad memset
- .quad .Lmemset_c
- .word X86_FEATURE_REP_GOOD
- .byte .Lfinal - memset
- .byte .Lmemset_e - .Lmemset_c
+ altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+ .Lfinal-memset,.Lmemset_e-.Lmemset_c
+ altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+ .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
.previous
diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S
new file mode 100644
index 00000000000..1cad22139c8
--- /dev/null
+++ b/arch/x86/lib/rwlock.S
@@ -0,0 +1,44 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/rwlock.h>
+
+#ifdef CONFIG_X86_32
+# define __lock_ptr eax
+#else
+# define __lock_ptr rdi
+#endif
+
+ENTRY(__write_lock_failed)
+ CFI_STARTPROC
+ FRAME
+0: LOCK_PREFIX
+ WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
+1: rep; nop
+ cmpl $WRITE_LOCK_CMP, (%__lock_ptr)
+ jne 1b
+ LOCK_PREFIX
+ WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
+ jnz 0b
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+END(__write_lock_failed)
+
+ENTRY(__read_lock_failed)
+ CFI_STARTPROC
+ FRAME
+0: LOCK_PREFIX
+ READ_LOCK_SIZE(inc) (%__lock_ptr)
+1: rep; nop
+ READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
+ js 1b
+ LOCK_PREFIX
+ READ_LOCK_SIZE(dec) (%__lock_ptr)
+ js 0b
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
deleted file mode 100644
index 05ea55f7140..00000000000
--- a/arch/x86/lib/rwlock_64.S
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Slow paths of read/write spinlocks. */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
-
-/* rdi: pointer to rwlock_t */
-ENTRY(__write_lock_failed)
- CFI_STARTPROC
- LOCK_PREFIX
- addl $RW_LOCK_BIAS,(%rdi)
-1: rep
- nop
- cmpl $RW_LOCK_BIAS,(%rdi)
- jne 1b
- LOCK_PREFIX
- subl $RW_LOCK_BIAS,(%rdi)
- jnz __write_lock_failed
- ret
- CFI_ENDPROC
-END(__write_lock_failed)
-
-/* rdi: pointer to rwlock_t */
-ENTRY(__read_lock_failed)
- CFI_STARTPROC
- LOCK_PREFIX
- incl (%rdi)
-1: rep
- nop
- cmpl $1,(%rdi)
- js 1b
- LOCK_PREFIX
- decl (%rdi)
- js __read_lock_failed
- ret
- CFI_ENDPROC
-END(__read_lock_failed)
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem.S
index 67743977398..5dff5f04246 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem.S
@@ -1,4 +1,51 @@
/*
+ * x86 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg)
+#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l)
+
+#ifdef CONFIG_X86_32
+
+/*
+ * The semaphore operations have a special calling sequence that
+ * allow us to do a simpler in-line version of them. These routines
+ * need to convert that sequence back into the C sequence when
+ * there is contention on the semaphore.
+ *
+ * %eax contains the semaphore pointer on entry. Save the C-clobbered
+ * registers (%eax, %edx and %ecx) except %eax whish is either a return
+ * value or just clobbered..
+ */
+
+#define save_common_regs \
+ pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+
+#define restore_common_regs \
+ popl_cfi %ecx; CFI_RESTORE ecx
+
+ /* Avoid uglifying the argument copying x86-64 needs to do. */
+ .macro movq src, dst
+ .endm
+
+#else
+
+/*
* x86-64 rwsem wrappers
*
* This interfaces the inline asm code to the slow-path
@@ -16,12 +63,6 @@
* but %rdi, %rsi, %rcx, %r8-r11 always need saving.
*/
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
#define save_common_regs \
pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
@@ -40,16 +81,18 @@
popq_cfi %rsi; CFI_RESTORE rsi; \
popq_cfi %rdi; CFI_RESTORE rdi
+#endif
+
/* Fix up special calling conventions */
ENTRY(call_rwsem_down_read_failed)
CFI_STARTPROC
save_common_regs
- pushq_cfi %rdx
- CFI_REL_OFFSET rdx, 0
+ __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+ CFI_REL_OFFSET __ASM_REG(dx), 0
movq %rax,%rdi
call rwsem_down_read_failed
- popq_cfi %rdx
- CFI_RESTORE rdx
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+ CFI_RESTORE __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
@@ -67,7 +110,8 @@ ENDPROC(call_rwsem_down_write_failed)
ENTRY(call_rwsem_wake)
CFI_STARTPROC
- decl %edx /* do nothing if still outstanding active readers */
+ /* do nothing if still outstanding active readers */
+ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
jnz 1f
save_common_regs
movq %rax,%rdi
@@ -77,16 +121,15 @@ ENTRY(call_rwsem_wake)
CFI_ENDPROC
ENDPROC(call_rwsem_wake)
-/* Fix up special calling conventions */
ENTRY(call_rwsem_downgrade_wake)
CFI_STARTPROC
save_common_regs
- pushq_cfi %rdx
- CFI_REL_OFFSET rdx, 0
+ __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+ CFI_REL_OFFSET __ASM_REG(dx), 0
movq %rax,%rdi
call rwsem_downgrade_wake
- popq_cfi %rdx
- CFI_RESTORE rdx
+ __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+ CFI_RESTORE __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
deleted file mode 100644
index 06691daa410..00000000000
--- a/arch/x86/lib/semaphore_32.S
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * i386 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- *
- * %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
- */
- .section .sched.text, "ax"
-
-/*
- * rw spinlock fallbacks
- */
-#ifdef CONFIG_SMP
-ENTRY(__write_lock_failed)
- CFI_STARTPROC
- FRAME
-2: LOCK_PREFIX
- addl $ RW_LOCK_BIAS,(%eax)
-1: rep; nop
- cmpl $ RW_LOCK_BIAS,(%eax)
- jne 1b
- LOCK_PREFIX
- subl $ RW_LOCK_BIAS,(%eax)
- jnz 2b
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__write_lock_failed)
-
-ENTRY(__read_lock_failed)
- CFI_STARTPROC
- FRAME
-2: LOCK_PREFIX
- incl (%eax)
-1: rep; nop
- cmpl $1,(%eax)
- js 1b
- LOCK_PREFIX
- decl (%eax)
- js 2b
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__read_lock_failed)
-
-#endif
-
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_down_read_failed)
- CFI_STARTPROC
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- pushl_cfi %edx
- CFI_REL_OFFSET edx,0
- call rwsem_down_read_failed
- popl_cfi %edx
- popl_cfi %ecx
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_down_read_failed)
-
-ENTRY(call_rwsem_down_write_failed)
- CFI_STARTPROC
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- calll rwsem_down_write_failed
- popl_cfi %ecx
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_down_write_failed)
-
-ENTRY(call_rwsem_wake)
- CFI_STARTPROC
- decw %dx /* do nothing if still outstanding active readers */
- jnz 1f
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- call rwsem_wake
- popl_cfi %ecx
-1: ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_wake)
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_downgrade_wake)
- CFI_STARTPROC
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx,0
- pushl_cfi %edx
- CFI_REL_OFFSET edx,0
- call rwsem_downgrade_wake
- popl_cfi %edx
- popl_cfi %ecx
- ret
- CFI_ENDPROC
- ENDPROC(call_rwsem_downgrade_wake)
-
-#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 782b082c9ff..a63efd6bb6a 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -5,50 +5,41 @@
* Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
- #include <linux/linkage.h>
- #include <asm/dwarf2.h>
- #include <asm/calling.h>
- #include <asm/rwlock.h>
-
- /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
- .macro thunk name,func
- .globl \name
-\name:
- CFI_STARTPROC
- SAVE_ARGS
- call \func
- jmp restore
- CFI_ENDPROC
- .endm
-
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* put return address in rdi (arg1) */
- .macro thunk_ra name,func
+ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+ .macro THUNK name, func, put_ret_addr_in_rdi=0
.globl \name
\name:
CFI_STARTPROC
+
+ /* this one pushes 9 elems, the next one would be %rIP */
SAVE_ARGS
- /* SAVE_ARGS pushs 9 elements */
- /* the next element would be the rip */
- movq 9*8(%rsp), %rdi
+
+ .if \put_ret_addr_in_rdi
+ movq_cfi_restore 9*8, rdi
+ .endif
+
call \func
jmp restore
CFI_ENDPROC
.endm
- thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
- thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#ifdef CONFIG_TRACE_IRQFLAGS
+ THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
+ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- thunk lockdep_sys_exit_thunk,lockdep_sys_exit
+ THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
#endif
-
+
/* SAVE_ARGS below is used only for the .cfi directives it contains. */
CFI_STARTPROC
SAVE_ARGS
restore:
RESTORE_ARGS
- ret
+ ret
CFI_ENDPROC
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
new file mode 100644
index 00000000000..97be9cb5448
--- /dev/null
+++ b/arch/x86/lib/usercopy.c
@@ -0,0 +1,43 @@
+/*
+ * User address space access functions.
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+/*
+ * best effort, GUP based copy_from_user() that is NMI-safe
+ */
+unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+ unsigned long offset, addr = (unsigned long)from;
+ unsigned long size, len = 0;
+ struct page *page;
+ void *map;
+ int ret;
+
+ do {
+ ret = __get_user_pages_fast(addr, 1, 0, &page);
+ if (!ret)
+ break;
+
+ offset = addr & (PAGE_SIZE - 1);
+ size = min(PAGE_SIZE - offset, n - len);
+
+ map = kmap_atomic(page);
+ memcpy(to, map+offset, size);
+ kunmap_atomic(map);
+ put_page(page);
+
+ len += size;
+ to += size;
+ addr += size;
+
+ } while (len < n);
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 3e608edf995..3d11327c9ab 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,8 +23,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
-obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
-obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
+obj-$(CONFIG_AMD_NUMA) += amdtopology.o
+obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology.c
index 0919c26820d..5247d01329c 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/io.h>
#include <linux/pci_ids.h>
@@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void)
int __init amd_numa_init(void)
{
- unsigned long start = PFN_PHYS(0);
- unsigned long end = PFN_PHYS(max_pfn);
+ u64 start = PFN_PHYS(0);
+ u64 end = PFN_PHYS(max_pfn);
unsigned numnodes;
- unsigned long prevbase;
+ u64 prevbase;
int i, j, nb;
u32 nodeid, reg;
unsigned int bits, cores, apicid_base;
@@ -95,7 +96,7 @@ int __init amd_numa_init(void)
prevbase = 0;
for (i = 0; i < 8; i++) {
- unsigned long base, limit;
+ u64 base, limit;
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -107,18 +108,18 @@ int __init amd_numa_init(void)
continue;
}
if (nodeid >= numnodes) {
- pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+ pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
base, limit);
continue;
}
if (!limit) {
- pr_info("Skipping node entry %d (base %lx)\n",
+ pr_info("Skipping node entry %d (base %Lx)\n",
i, base);
continue;
}
if ((base >> 8) & 3 || (limit >> 8) & 3) {
- pr_err("Node %d using interleaving mode %lx/%lx\n",
+ pr_err("Node %d using interleaving mode %Lx/%Lx\n",
nodeid, (base >> 8) & 3, (limit >> 8) & 3);
return -EINVAL;
}
@@ -150,19 +151,19 @@ int __init amd_numa_init(void)
continue;
}
if (limit < base) {
- pr_err("Node %d bogus settings %lx-%lx.\n",
+ pr_err("Node %d bogus settings %Lx-%Lx.\n",
nodeid, base, limit);
continue;
}
/* Could sort here, but pun for now. Should not happen anyroads. */
if (prevbase > base) {
- pr_err("Node map not sorted %lx,%lx\n",
+ pr_err("Node map not sorted %Lx,%Lx\n",
prevbase, base);
return -EINVAL;
}
- pr_info("Node %d MemBase %016lx Limit %016lx\n",
+ pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
nodeid, base, limit);
prevbase = base;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 20e3f8702d1..4d09df054e3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -12,6 +12,7 @@
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
+#include <linux/prefetch.h> /* prefetchw */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -822,16 +823,30 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}
-static noinline void
+static noinline int
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
+ /*
+ * Pagefault was interrupted by SIGKILL. We have no reason to
+ * continue pagefault.
+ */
+ if (fatal_signal_pending(current)) {
+ if (!(fault & VM_FAULT_RETRY))
+ up_read(&current->mm->mmap_sem);
+ if (!(error_code & PF_USER))
+ no_context(regs, error_code, address);
+ return 1;
+ }
+ if (!(fault & VM_FAULT_ERROR))
+ return 0;
+
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address);
- return;
+ return 1;
}
out_of_memory(regs, error_code, address);
@@ -842,6 +857,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
else
BUG();
}
+ return 1;
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -964,7 +980,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
struct mm_struct *mm;
int fault;
int write = error_code & PF_WRITE;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0);
tsk = current;
@@ -1043,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/*
* If we're in an interrupt, have no user context or are running
@@ -1132,9 +1148,9 @@ good_area:
*/
fault = handle_mm_fault(mm, vma, address, flags);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, error_code, address, fault);
- return;
+ if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
+ if (mm_fault_error(regs, error_code, address, fault))
+ return;
}
/*
@@ -1145,11 +1161,11 @@ good_area:
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
regs, address);
} else {
tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
regs, address);
}
if (fault & VM_FAULT_RETRY) {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index d4203988504..f581a18c0d4 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
out:
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 286d289b039..30326443ab8 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -16,8 +16,6 @@
#include <asm/tlb.h>
#include <asm/proto.h>
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
unsigned long __initdata pgt_buf_start;
unsigned long __meminitdata pgt_buf_end;
unsigned long __meminitdata pgt_buf_top;
@@ -81,6 +79,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
}
+void __init native_pagetable_reserve(u64 start, u64 end)
+{
+ memblock_x86_reserve_range(start, end, "PGTABLE");
+}
+
struct map_range {
unsigned long start;
unsigned long end;
@@ -272,9 +275,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
__flush_tlb_all();
+ /*
+ * Reserve the kernel pagetable pages we used (pgt_buf_start -
+ * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
+ * so that they can be reused for other purposes.
+ *
+ * On native it just means calling memblock_x86_reserve_range, on Xen it
+ * also means marking RW the pagetable pages that we allocated before
+ * but that haven't been used.
+ *
+ * In fact on xen we mark RO the whole range pgt_buf_start -
+ * pgt_buf_top, because we have to make sure that when
+ * init_memory_mapping reaches the pagetable pages area, it maps
+ * RO all the pagetable pages, including the ones that are beyond
+ * pgt_buf_end at that time.
+ */
if (!after_bootmem && pgt_buf_end > pgt_buf_start)
- memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
- pgt_buf_end << PAGE_SHIFT, "PGTABLE");
+ x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+ PFN_PHYS(pgt_buf_end));
if (!after_bootmem)
early_memtest(start, end);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 80088f99419..29f7c6d9817 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -678,8 +678,10 @@ static void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+#ifdef CONFIG_ZONE_DMA
max_zone_pfns[ZONE_DMA] =
virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
@@ -716,6 +718,7 @@ void __init paging_init(void)
* NOTE: at this point the bootmem allocator is fully available.
*/
olpc_dt_build_devicetree();
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_sizes_init();
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 79423358728..bbaaa005bf0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -28,6 +28,7 @@
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
+#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
@@ -616,7 +617,9 @@ void __init paging_init(void)
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+#ifdef CONFIG_ZONE_DMA
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+#endif
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
max_zone_pfns[ZONE_NORMAL] = max_pfn;
@@ -679,14 +682,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(arch_add_memory);
-#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
-int memory_add_physaddr_to_nid(u64 start)
-{
- return 0;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
-
#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_vsyscall;
@@ -901,8 +896,6 @@ const char *arch_vma_name(struct vm_area_struct *vma)
}
#ifdef CONFIG_X86_UV
-#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS)
-
unsigned long memory_block_size_bytes(void)
{
if (is_uv_system()) {
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0369843511d..be1ef574ce9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
return (__force void __iomem *)phys_to_virt(phys_addr);
/*
- * Check if the request spans more than any BAR in the iomem resource
- * tree.
- */
- WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
- KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
-
- /*
* Don't allow anybody to remap normal RAM that we're using..
*/
last_pfn = last_addr >> PAGE_SHIFT;
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
ret_addr = (void __iomem *) (vaddr + offset);
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+ /*
+ * Check if the request spans more than any BAR in the iomem resource
+ * tree.
+ */
+ WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
+ KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+
return ret_addr;
err_free_area:
free_vm_area(area);
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 704a37ceddd..dab41876cdd 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
- save_stack_trace_regs(&e->trace, regs);
+ save_stack_trace_regs(regs, &e->trace);
/* Round address down to nearest 16 bytes */
shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
index aa1169392b8..992da5ec5a6 100644
--- a/arch/x86/mm/memblock.c
+++ b/arch/x86/mm/memblock.c
@@ -8,7 +8,7 @@
#include <linux/range.h>
/* Check for already reserved areas */
-static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align)
+bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align)
{
struct memblock_region *r;
u64 addr = *addrp, last;
@@ -59,7 +59,7 @@ u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
if (addr >= ei_last)
continue;
*sizep = ei_last - addr;
- while (check_with_memblock_reserved_size(&addr, sizep, align))
+ while (memblock_x86_check_reserved_size(&addr, sizep, align))
;
if (*sizep)
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 3adff7dcc14..67421f38a21 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -34,7 +34,7 @@
#include <asm/pgtable.h>
#include <linux/mmiotrace.h>
#include <asm/e820.h> /* for ISA_START_ADDRESS */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 9559d360fde..fbeaaf41661 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,11 +1,39 @@
/* Common code for 32 and 64-bit NUMA */
-#include <linux/topology.h>
-#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
#include <linux/bootmem.h>
-#include <asm/numa.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/sched.h>
+#include <linux/topology.h>
+
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
#include <asm/acpi.h>
+#include <asm/amd_nb.h>
+
+#include "numa_internal.h"
int __initdata numa_off;
+nodemask_t numa_nodes_parsed __initdata;
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
+static struct numa_meminfo numa_meminfo
+#ifndef CONFIG_MEMORY_HOTPLUG
+__initdata
+#endif
+;
+
+static int numa_distance_cnt;
+static u8 *numa_distance;
static __init int numa_setup(char *opt)
{
@@ -32,6 +60,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
+int __cpuinit numa_cpu_node(int cpu)
+{
+ int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+ if (apicid != BAD_APICID)
+ return __apicid_to_node[apicid];
+ return NUMA_NO_NODE;
+}
+
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
@@ -95,6 +132,422 @@ void __init setup_node_to_cpumask_map(void)
pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
}
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+ struct numa_meminfo *mi)
+{
+ /* ignore zero length blks */
+ if (start == end)
+ return 0;
+
+ /* whine about and ignore invalid blks */
+ if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+ pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
+ nid, start, end);
+ return 0;
+ }
+
+ if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("NUMA: too many memblk ranges\n");
+ return -EINVAL;
+ }
+
+ mi->blk[mi->nr_blks].start = start;
+ mi->blk[mi->nr_blks].end = end;
+ mi->blk[mi->nr_blks].nid = nid;
+ mi->nr_blks++;
+ return 0;
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+ mi->nr_blks--;
+ memmove(&mi->blk[idx], &mi->blk[idx + 1],
+ (mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
+/* Initialize NODE_DATA for a node on the local memory */
+static void __init setup_node_data(int nid, u64 start, u64 end)
+{
+ const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
+ const u64 nd_high = PFN_PHYS(max_pfn_mapped);
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ bool remapped = false;
+ u64 nd_pa;
+ void *nd;
+ int tnid;
+
+ /*
+ * Don't confuse VM with a node that doesn't have the
+ * minimum amount of memory:
+ */
+ if (end && (end - start) < NODE_MIN_SIZE)
+ return;
+
+ /* initialize remap allocator before aligning to ZONE_ALIGN */
+ init_alloc_remap(nid, start, end);
+
+ start = roundup(start, ZONE_ALIGN);
+
+ printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
+ nid, start, end);
+
+ /*
+ * Allocate node data. Try remap allocator first, node-local
+ * memory and then any node. Never allocate in DMA zone.
+ */
+ nd = alloc_remap(nid, nd_size);
+ if (nd) {
+ nd_pa = __pa(nd);
+ remapped = true;
+ } else {
+ nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
+ nd_size, SMP_CACHE_BYTES);
+ if (nd_pa == MEMBLOCK_ERROR)
+ nd_pa = memblock_find_in_range(nd_low, nd_high,
+ nd_size, SMP_CACHE_BYTES);
+ if (nd_pa == MEMBLOCK_ERROR) {
+ pr_err("Cannot find %zu bytes in node %d\n",
+ nd_size, nid);
+ return;
+ }
+ memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
+ nd = __va(nd_pa);
+ }
+
+ /* report and initialize */
+ printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
+ nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+ tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+ if (!remapped && tnid != nid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
+
+ node_data[nid] = nd;
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+ NODE_DATA(nid)->node_id = nid;
+ NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
+ NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
+
+ node_set_online(nid);
+}
+
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unncessary memblks. Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+ const u64 low = 0;
+ const u64 high = PFN_PHYS(max_pfn);
+ int i, j, k;
+
+ /* first, trim all entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ /* make sure all blocks are inside the limits */
+ bi->start = max(bi->start, low);
+ bi->end = min(bi->end, high);
+
+ /* and there's no empty block */
+ if (bi->start >= bi->end)
+ numa_remove_memblk_from(i--, mi);
+ }
+
+ /* merge neighboring / overlapping entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ for (j = i + 1; j < mi->nr_blks; j++) {
+ struct numa_memblk *bj = &mi->blk[j];
+ u64 start, end;
+
+ /*
+ * See whether there are overlapping blocks. Whine
+ * about but allow overlaps of the same nid. They
+ * will be merged below.
+ */
+ if (bi->end > bj->start && bi->start < bj->end) {
+ if (bi->nid != bj->nid) {
+ pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
+ bi->nid, bi->start, bi->end,
+ bj->nid, bj->start, bj->end);
+ return -EINVAL;
+ }
+ pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
+ bi->nid, bi->start, bi->end,
+ bj->start, bj->end);
+ }
+
+ /*
+ * Join together blocks on the same node, holes
+ * between which don't overlap with memory on other
+ * nodes.
+ */
+ if (bi->nid != bj->nid)
+ continue;
+ start = min(bi->start, bj->start);
+ end = max(bi->end, bj->end);
+ for (k = 0; k < mi->nr_blks; k++) {
+ struct numa_memblk *bk = &mi->blk[k];
+
+ if (bi->nid == bk->nid)
+ continue;
+ if (start < bk->end && end > bk->start)
+ break;
+ }
+ if (k < mi->nr_blks)
+ continue;
+ printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
+ bi->nid, bi->start, bi->end, bj->start, bj->end,
+ start, end);
+ bi->start = start;
+ bi->end = end;
+ numa_remove_memblk_from(j--, mi);
+ }
+ }
+
+ /* clear unused ones */
+ for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+ mi->blk[i].start = mi->blk[i].end = 0;
+ mi->blk[i].nid = NUMA_NO_NODE;
+ }
+
+ return 0;
+}
+
+/*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+ const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+ if (mi->blk[i].start != mi->blk[i].end &&
+ mi->blk[i].nid != NUMA_NO_NODE)
+ node_set(mi->blk[i].nid, *nodemask);
+}
+
+/**
+ * numa_reset_distance - Reset NUMA distance table
+ *
+ * The current table is freed. The next numa_set_distance() call will
+ * create a new one.
+ */
+void __init numa_reset_distance(void)
+{
+ size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
+
+ /* numa_distance could be 1LU marking allocation failure, test cnt */
+ if (numa_distance_cnt)
+ memblock_x86_free_range(__pa(numa_distance),
+ __pa(numa_distance) + size);
+ numa_distance_cnt = 0;
+ numa_distance = NULL; /* enable table creation */
+}
+
+static int __init numa_alloc_distance(void)
+{
+ nodemask_t nodes_parsed;
+ size_t size;
+ int i, j, cnt = 0;
+ u64 phys;
+
+ /* size the new table and allocate it */
+ nodes_parsed = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
+
+ for_each_node_mask(i, nodes_parsed)
+ cnt = i;
+ cnt++;
+ size = cnt * cnt * sizeof(numa_distance[0]);
+
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ size, PAGE_SIZE);
+ if (phys == MEMBLOCK_ERROR) {
+ pr_warning("NUMA: Warning: can't allocate distance table!\n");
+ /* don't retry until explicitly reset */
+ numa_distance = (void *)1LU;
+ return -ENOMEM;
+ }
+ memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
+
+ numa_distance = __va(phys);
+ numa_distance_cnt = cnt;
+
+ /* fill with the default distances */
+ for (i = 0; i < cnt; i++)
+ for (j = 0; j < cnt; j++)
+ numa_distance[i * cnt + j] = i == j ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+
+ return 0;
+}
+
+/**
+ * numa_set_distance - Set NUMA distance from one NUMA to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to' node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance. If distance table
+ * doesn't exist, one which is large enough to accommodate all the currently
+ * known nodes will be created.
+ *
+ * If such table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node at the time of
+ * table creation or @distance doesn't make sense, the call is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
+ */
+void __init numa_set_distance(int from, int to, int distance)
+{
+ if (!numa_distance && numa_alloc_distance() < 0)
+ return;
+
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
+ printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ if ((u8)distance != distance ||
+ (from == to && distance != LOCAL_DISTANCE)) {
+ pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ numa_distance[from * numa_distance_cnt + to] = distance;
+}
+
+int __node_distance(int from, int to)
+{
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+ return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+ return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+/*
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common). Make sure the nodes cover all memory.
+ */
+static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
+{
+ u64 numaram, e820ram;
+ int i;
+
+ numaram = 0;
+ for (i = 0; i < mi->nr_blks; i++) {
+ u64 s = mi->blk[i].start >> PAGE_SHIFT;
+ u64 e = mi->blk[i].end >> PAGE_SHIFT;
+ numaram += e - s;
+ numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
+ if ((s64)numaram < 0)
+ numaram = 0;
+ }
+
+ e820ram = max_pfn - (memblock_x86_hole_size(0,
+ PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
+ /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+ if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
+ printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
+ (numaram << PAGE_SHIFT) >> 20,
+ (e820ram << PAGE_SHIFT) >> 20);
+ return false;
+ }
+ return true;
+}
+
+static int __init numa_register_memblks(struct numa_meminfo *mi)
+{
+ unsigned long uninitialized_var(pfn_align);
+ int i, nid;
+
+ /* Account for nodes with cpus and no memory */
+ node_possible_map = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&node_possible_map, mi);
+ if (WARN_ON(nodes_empty(node_possible_map)))
+ return -EINVAL;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ memblock_x86_register_active_regions(mi->blk[i].nid,
+ mi->blk[i].start >> PAGE_SHIFT,
+ mi->blk[i].end >> PAGE_SHIFT);
+
+ /* for out of order entries */
+ sort_node_map();
+
+ /*
+ * If sections array is gonna be used for pfn -> nid mapping, check
+ * whether its granularity is fine enough.
+ */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ pfn_align = node_map_pfn_alignment();
+ if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+ printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+ PFN_PHYS(pfn_align) >> 20,
+ PFN_PHYS(PAGES_PER_SECTION) >> 20);
+ return -EINVAL;
+ }
+#endif
+ if (!numa_meminfo_cover_memory(mi))
+ return -EINVAL;
+
+ /* Finally register nodes. */
+ for_each_node_mask(nid, node_possible_map) {
+ u64 start = PFN_PHYS(max_pfn);
+ u64 end = 0;
+
+ for (i = 0; i < mi->nr_blks; i++) {
+ if (nid != mi->blk[i].nid)
+ continue;
+ start = min(mi->blk[i].start, start);
+ end = max(mi->blk[i].end, end);
+ }
+
+ if (start < end)
+ setup_node_data(nid, start, end);
+ }
+
+ return 0;
+}
+
/*
* There are unfortunately some poorly designed mainboards around that
* only connect memory to a single CPU. This breaks the 1:1 cpu->node
@@ -102,7 +555,7 @@ void __init setup_node_to_cpumask_map(void)
* as the number of CPUs is not known yet. We round robin the existing
* nodes.
*/
-void __init numa_init_array(void)
+static void __init numa_init_array(void)
{
int rr, i;
@@ -117,6 +570,95 @@ void __init numa_init_array(void)
}
}
+static int __init numa_init(int (*init_func)(void))
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ set_apicid_to_node(i, NUMA_NO_NODE);
+
+ nodes_clear(numa_nodes_parsed);
+ nodes_clear(node_possible_map);
+ nodes_clear(node_online_map);
+ memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+ remove_all_active_ranges();
+ numa_reset_distance();
+
+ ret = init_func();
+ if (ret < 0)
+ return ret;
+ ret = numa_cleanup_meminfo(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ numa_emulation(&numa_meminfo, numa_distance_cnt);
+
+ ret = numa_register_memblks(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ int nid = early_cpu_to_node(i);
+
+ if (nid == NUMA_NO_NODE)
+ continue;
+ if (!node_online(nid))
+ numa_clear_node(i);
+ }
+ numa_init_array();
+ return 0;
+}
+
+/**
+ * dummy_numa_init - Fallback dummy NUMA init
+ *
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
+ *
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory. This function must not fail.
+ */
+static int __init dummy_numa_init(void)
+{
+ printk(KERN_INFO "%s\n",
+ numa_off ? "NUMA turned off" : "No NUMA configuration found");
+ printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
+ 0LLU, PFN_PHYS(max_pfn));
+
+ node_set(0, numa_nodes_parsed);
+ numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+
+ return 0;
+}
+
+/**
+ * x86_numa_init - Initialize NUMA
+ *
+ * Try each configured NUMA initialization method until one succeeds. The
+ * last fallback is dummy single node config encomapssing whole memory and
+ * never fails.
+ */
+void __init x86_numa_init(void)
+{
+ if (!numa_off) {
+#ifdef CONFIG_X86_NUMAQ
+ if (!numa_init(numaq_numa_init))
+ return;
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!numa_init(x86_acpi_numa_init))
+ return;
+#endif
+#ifdef CONFIG_AMD_NUMA
+ if (!numa_init(amd_numa_init))
+ return;
+#endif
+ }
+
+ numa_init(dummy_numa_init);
+}
+
static __init int find_near_online_node(int node)
{
int n, val;
@@ -213,53 +755,48 @@ int early_cpu_to_node(int cpu)
return per_cpu(x86_cpu_to_node_map, cpu);
}
-struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
- int node = early_cpu_to_node(cpu);
struct cpumask *mask;
char buf[64];
if (node == NUMA_NO_NODE) {
/* early_cpu_to_node() already emits a warning and trace */
- return NULL;
+ return;
}
mask = node_to_cpumask_map[node];
if (!mask) {
pr_err("node_to_cpumask_map[%i] NULL\n", node);
dump_stack();
- return NULL;
+ return;
}
+ if (enable)
+ cpumask_set_cpu(cpu, mask);
+ else
+ cpumask_clear_cpu(cpu, mask);
+
cpulist_scnprintf(buf, sizeof(buf), mask);
printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
enable ? "numa_add_cpu" : "numa_remove_cpu",
cpu, node, buf);
- return mask;
+ return;
}
# ifndef CONFIG_NUMA_EMU
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
+static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
- struct cpumask *mask;
-
- mask = debug_cpumask_set_cpu(cpu, enable);
- if (!mask)
- return;
-
- if (enable)
- cpumask_set_cpu(cpu, mask);
- else
- cpumask_clear_cpu(cpu, mask);
+ debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}
void __cpuinit numa_add_cpu(int cpu)
{
- numa_set_cpumask(cpu, 1);
+ numa_set_cpumask(cpu, true);
}
void __cpuinit numa_remove_cpu(int cpu)
{
- numa_set_cpumask(cpu, 0);
+ numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */
@@ -287,3 +824,18 @@ const struct cpumask *cpumask_of_node(int node)
EXPORT_SYMBOL(cpumask_of_node);
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int memory_add_physaddr_to_nid(u64 start)
+{
+ struct numa_meminfo *mi = &numa_meminfo;
+ int nid = mi->blk[0].nid;
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].start <= start && mi->blk[i].end > start)
+ nid = mi->blk[i].nid;
+ return nid;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index bde3906420d..3adebe7e536 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,39 +22,11 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
-#include <linux/mmzone.h>
-#include <linux/highmem.h>
-#include <linux/initrd.h>
-#include <linux/nodemask.h>
#include <linux/module.h>
-#include <linux/kexec.h>
-#include <linux/pfn.h>
-#include <linux/swap.h>
-#include <linux/acpi.h>
-
-#include <asm/e820.h>
-#include <asm/setup.h>
-#include <asm/mmzone.h>
-#include <asm/bios_ebda.h>
-#include <asm/proto.h>
-
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-
-/*
- * numa interface - we expect the numa architecture specific code to have
- * populated the following initialisation.
- *
- * 1) node_online_map - the map of all nodes configured (online) in the system
- * 2) node_start_pfn - the starting page frame number for a node
- * 3) node_end_pfn - the ending page fram number for a node
- */
-unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
-unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
+#include "numa_internal.h"
#ifdef CONFIG_DISCONTIGMEM
/*
@@ -69,7 +41,7 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
* physnode_map[16-31] = 1;
* physnode_map[32- ] = -1;
*/
-s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
EXPORT_SYMBOL(physnode_map);
void memory_present(int nid, unsigned long start, unsigned long end)
@@ -80,8 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
- for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
- physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ physnode_map[pfn / PAGES_PER_SECTION] = nid;
printk(KERN_CONT "%lx ", pfn);
}
printk(KERN_CONT "\n");
@@ -99,108 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
}
#endif
-extern unsigned long find_max_low_pfn(void);
extern unsigned long highend_pfn, highstart_pfn;
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-unsigned long node_remap_size[MAX_NUMNODES];
static void *node_remap_start_vaddr[MAX_NUMNODES];
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-static unsigned long kva_start_pfn;
-static unsigned long kva_pages;
-
-int __cpuinit numa_cpu_node(int cpu)
-{
- return apic->x86_32_numa_cpu_node(cpu);
-}
-
-/*
- * FLAT - support for basic PC memory model with discontig enabled, essentially
- * a single node with all available processors in it with a flat
- * memory map.
- */
-int __init get_memcfg_numa_flat(void)
-{
- printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
-
- node_start_pfn[0] = 0;
- node_end_pfn[0] = max_pfn;
- memblock_x86_register_active_regions(0, 0, max_pfn);
- memory_present(0, 0, max_pfn);
- node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
-
- /* Indicate there is one node available. */
- nodes_clear(node_online_map);
- node_set_online(0);
- return 1;
-}
-
-/*
- * Find the highest page frame number we have available for the node
- */
-static void __init propagate_e820_map_node(int nid)
-{
- if (node_end_pfn[nid] > max_pfn)
- node_end_pfn[nid] = max_pfn;
- /*
- * if a user has given mem=XXXX, then we need to make sure
- * that the node _starts_ before that, too, not just ends
- */
- if (node_start_pfn[nid] > max_pfn)
- node_start_pfn[nid] = max_pfn;
- BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
-}
-
-/*
- * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
- * method. For node zero take this from the bottom of memory, for
- * subsequent nodes place them at node_remap_start_vaddr which contains
- * node local data in physically node local memory. See setup_memory()
- * for details.
- */
-static void __init allocate_pgdat(int nid)
-{
- char buf[16];
-
- if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
- NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
- else {
- unsigned long pgdat_phys;
- pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
- max_pfn_mapped<<PAGE_SHIFT,
- sizeof(pg_data_t),
- PAGE_SIZE);
- NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
- memset(buf, 0, sizeof(buf));
- sprintf(buf, "NODE_DATA %d", nid);
- memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
- }
- printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
- nid, (unsigned long)NODE_DATA(nid));
-}
-
/*
- * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
- * virtual address space (KVA) is reserved and portions of nodes are mapped
- * using it. This is to allow node-local memory to be allocated for
- * structures that would normally require ZONE_NORMAL. The memory is
- * allocated with alloc_remap() and callers should be prepared to allocate
- * from the bootmem allocator instead.
+ * Remap memory allocator
*/
static unsigned long node_remap_start_pfn[MAX_NUMNODES];
static void *node_remap_end_vaddr[MAX_NUMNODES];
static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
+/**
+ * alloc_remap - Allocate remapped memory
+ * @nid: NUMA node to allocate memory from
+ * @size: The size of allocation
+ *
+ * Allocate @size bytes from the remap area of NUMA node @nid. The
+ * size of the remap area is predetermined by init_alloc_remap() and
+ * only the callers considered there should call this function. For
+ * more info, please read the comment on top of init_alloc_remap().
+ *
+ * The caller must be ready to handle allocation failure from this
+ * function and fall back to regular memory allocator in such cases.
+ *
+ * CONTEXT:
+ * Single CPU early boot context.
+ *
+ * RETURNS:
+ * Pointer to the allocated memory on success, %NULL on failure.
+ */
void *alloc_remap(int nid, unsigned long size)
{
void *allocation = node_remap_alloc_vaddr[nid];
size = ALIGN(size, L1_CACHE_BYTES);
- if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+ if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
return NULL;
node_remap_alloc_vaddr[nid] += size;
@@ -209,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size)
return allocation;
}
-static void __init remap_numa_kva(void)
-{
- void *vaddr;
- unsigned long pfn;
- int node;
-
- for_each_online_node(node) {
- printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
- for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
- vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
- printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
- (unsigned long)vaddr,
- node_remap_start_pfn[node] + pfn);
- set_pmd_pfn((ulong) vaddr,
- node_remap_start_pfn[node] + pfn,
- PAGE_KERNEL_LARGE);
- }
- }
-}
-
#ifdef CONFIG_HIBERNATION
/**
* resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -240,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
int node;
for_each_online_node(node) {
- unsigned long start_va, start_pfn, size, pfn;
+ unsigned long start_va, start_pfn, nr_pages, pfn;
start_va = (unsigned long)node_remap_start_vaddr[node];
start_pfn = node_remap_start_pfn[node];
- size = node_remap_size[node];
+ nr_pages = (node_remap_end_vaddr[node] -
+ node_remap_start_vaddr[node]) >> PAGE_SHIFT;
printk(KERN_DEBUG "%s: node %d\n", __func__, node);
- for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) {
+ for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
pgd_t *pgd = pgd_base + pgd_index(vaddr);
pud_t *pud = pud_offset(pgd, vaddr);
@@ -264,132 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base)
}
#endif
-static __init unsigned long calculate_numa_remap_pages(void)
+/**
+ * init_alloc_remap - Initialize remap allocator for a NUMA node
+ * @nid: NUMA node to initizlie remap allocator for
+ *
+ * NUMA nodes may end up without any lowmem. As allocating pgdat and
+ * memmap on a different node with lowmem is inefficient, a special
+ * remap allocator is implemented which can be used by alloc_remap().
+ *
+ * For each node, the amount of memory which will be necessary for
+ * pgdat and memmap is calculated and two memory areas of the size are
+ * allocated - one in the node and the other in lowmem; then, the area
+ * in the node is remapped to the lowmem area.
+ *
+ * As pgdat and memmap must be allocated in lowmem anyway, this
+ * doesn't waste lowmem address space; however, the actual lowmem
+ * which gets remapped over is wasted. The amount shouldn't be
+ * problematic on machines this feature will be used.
+ *
+ * Initialization failure isn't fatal. alloc_remap() is used
+ * opportunistically and the callers will fall back to other memory
+ * allocation mechanisms on failure.
+ */
+void __init init_alloc_remap(int nid, u64 start, u64 end)
{
- int nid;
- unsigned long size, reserve_pages = 0;
-
- for_each_online_node(nid) {
- u64 node_kva_target;
- u64 node_kva_final;
-
- /*
- * The acpi/srat node info can show hot-add memroy zones
- * where memory could be added but not currently present.
- */
- printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
- nid, node_start_pfn[nid], node_end_pfn[nid]);
- if (node_start_pfn[nid] > max_pfn)
- continue;
- if (!node_end_pfn[nid])
- continue;
- if (node_end_pfn[nid] > max_pfn)
- node_end_pfn[nid] = max_pfn;
-
- /* ensure the remap includes space for the pgdat. */
- size = node_remap_size[nid] + sizeof(pg_data_t);
-
- /* convert size to large (pmd size) pages, rounding up */
- size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
- /* now the roundup is correct, convert to PAGE_SIZE pages */
- size = size * PTRS_PER_PTE;
-
- node_kva_target = round_down(node_end_pfn[nid] - size,
- PTRS_PER_PTE);
- node_kva_target <<= PAGE_SHIFT;
- do {
- node_kva_final = memblock_find_in_range(node_kva_target,
- ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
- ((u64)size)<<PAGE_SHIFT,
- LARGE_PAGE_BYTES);
- node_kva_target -= LARGE_PAGE_BYTES;
- } while (node_kva_final == MEMBLOCK_ERROR &&
- (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
-
- if (node_kva_final == MEMBLOCK_ERROR)
- panic("Can not get kva ram\n");
-
- node_remap_size[nid] = size;
- node_remap_offset[nid] = reserve_pages;
- reserve_pages += size;
- printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
- " node %d at %llx\n",
- size, nid, node_kva_final>>PAGE_SHIFT);
-
- /*
- * prevent kva address below max_low_pfn want it on system
- * with less memory later.
- * layout will be: KVA address , KVA RAM
- *
- * we are supposed to only record the one less then max_low_pfn
- * but we could have some hole in high memory, and it will only
- * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
- * to use it as free.
- * So memblock_x86_reserve_range here, hope we don't run out of that array
- */
- memblock_x86_reserve_range(node_kva_final,
- node_kva_final+(((u64)size)<<PAGE_SHIFT),
- "KVA RAM");
-
- node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
- }
- printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
- reserve_pages);
- return reserve_pages;
-}
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long end_pfn = end >> PAGE_SHIFT;
+ unsigned long size, pfn;
+ u64 node_pa, remap_pa;
+ void *remap_va;
-static void init_remap_allocator(int nid)
-{
- node_remap_start_vaddr[nid] = pfn_to_kaddr(
- kva_start_pfn + node_remap_offset[nid]);
- node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
- (node_remap_size[nid] * PAGE_SIZE);
- node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
- ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
- printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
- (ulong) node_remap_start_vaddr[nid],
- (ulong) node_remap_end_vaddr[nid]);
+ /*
+ * The acpi/srat node info can show hot-add memroy zones where
+ * memory could be added but not currently present.
+ */
+ printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+ nid, start_pfn, end_pfn);
+
+ /* calculate the necessary space aligned to large page size */
+ size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
+ size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+ size = ALIGN(size, LARGE_PAGE_BYTES);
+
+ /* allocate node memory and the lowmem remap area */
+ node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
+ if (node_pa == MEMBLOCK_ERROR) {
+ pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
+ size, nid);
+ return;
+ }
+ memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
+
+ remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
+ max_low_pfn << PAGE_SHIFT,
+ size, LARGE_PAGE_BYTES);
+ if (remap_pa == MEMBLOCK_ERROR) {
+ pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
+ size, nid);
+ memblock_x86_free_range(node_pa, node_pa + size);
+ return;
+ }
+ memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
+ remap_va = phys_to_virt(remap_pa);
+
+ /* perform actual remap */
+ for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
+ set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
+ (node_pa >> PAGE_SHIFT) + pfn,
+ PAGE_KERNEL_LARGE);
+
+ /* initialize remap allocator parameters */
+ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
+ node_remap_start_vaddr[nid] = remap_va;
+ node_remap_end_vaddr[nid] = remap_va + size;
+ node_remap_alloc_vaddr[nid] = remap_va;
+
+ printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
+ nid, node_pa, node_pa + size, remap_va, remap_va + size);
}
void __init initmem_init(void)
{
- int nid;
- long kva_target_pfn;
-
- /*
- * When mapping a NUMA machine we allocate the node_mem_map arrays
- * from node local memory. They are then mapped directly into KVA
- * between zone normal and vmalloc space. Calculate the size of
- * this space and use it to adjust the boundary between ZONE_NORMAL
- * and ZONE_HIGHMEM.
- */
-
- get_memcfg_numa();
- numa_init_array();
-
- kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
+ x86_numa_init();
- kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
- do {
- kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
- max_low_pfn<<PAGE_SHIFT,
- kva_pages<<PAGE_SHIFT,
- PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
- kva_target_pfn -= PTRS_PER_PTE;
- } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
-
- if (kva_start_pfn == MEMBLOCK_ERROR)
- panic("Can not get kva space\n");
-
- printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
- kva_start_pfn, max_low_pfn);
- printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
-
- /* avoid clash with initrd */
- memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
- (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
- "KVA PG");
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
@@ -409,51 +257,9 @@ void __init initmem_init(void)
printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
- for_each_online_node(nid) {
- init_remap_allocator(nid);
-
- allocate_pgdat(nid);
- }
- remap_numa_kva();
printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
- for_each_online_node(nid)
- propagate_e820_map_node(nid);
-
- for_each_online_node(nid) {
- memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
- NODE_DATA(nid)->node_id = nid;
- }
setup_bootmem_allocator();
}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int paddr_to_nid(u64 addr)
-{
- int nid;
- unsigned long pfn = PFN_DOWN(addr);
-
- for_each_node(nid)
- if (node_start_pfn[nid] <= pfn &&
- pfn < node_end_pfn[nid])
- return nid;
-
- return -1;
-}
-
-/*
- * This function is used to ask node id BEFORE memmap and mem_section's
- * initialization (pfn_to_nid() can't be used yet).
- * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
- */
-int memory_add_physaddr_to_nid(u64 addr)
-{
- int nid = paddr_to_nid(addr);
- return (nid >= 0) ? nid : 0;
-}
-
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
-
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index e8c00cc7203..dd27f401f0a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -2,646 +2,13 @@
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/init.h>
#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/mmzone.h>
-#include <linux/ctype.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <linux/sched.h>
-#include <linux/acpi.h>
-
-#include <asm/e820.h>
-#include <asm/proto.h>
-#include <asm/dma.h>
-#include <asm/acpi.h>
-#include <asm/amd_nb.h>
#include "numa_internal.h"
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-
-nodemask_t numa_nodes_parsed __initdata;
-
-struct memnode memnode;
-
-static unsigned long __initdata nodemap_addr;
-static unsigned long __initdata nodemap_size;
-
-static struct numa_meminfo numa_meminfo __initdata;
-
-static int numa_distance_cnt;
-static u8 *numa_distance;
-
-/*
- * Given a shift value, try to populate memnodemap[]
- * Returns :
- * 1 if OK
- * 0 if memnodmap[] too small (of shift too small)
- * -1 if node overlap or lost ram (shift too big)
- */
-static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
-{
- unsigned long addr, end;
- int i, res = -1;
-
- memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
- for (i = 0; i < mi->nr_blks; i++) {
- addr = mi->blk[i].start;
- end = mi->blk[i].end;
- if (addr >= end)
- continue;
- if ((end >> shift) >= memnodemapsize)
- return 0;
- do {
- if (memnodemap[addr >> shift] != NUMA_NO_NODE)
- return -1;
- memnodemap[addr >> shift] = mi->blk[i].nid;
- addr += (1UL << shift);
- } while (addr < end);
- res = 1;
- }
- return res;
-}
-
-static int __init allocate_cachealigned_memnodemap(void)
-{
- unsigned long addr;
-
- memnodemap = memnode.embedded_map;
- if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
- return 0;
-
- addr = 0x8000;
- nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
- nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
- nodemap_size, L1_CACHE_BYTES);
- if (nodemap_addr == MEMBLOCK_ERROR) {
- printk(KERN_ERR
- "NUMA: Unable to allocate Memory to Node hash map\n");
- nodemap_addr = nodemap_size = 0;
- return -1;
- }
- memnodemap = phys_to_virt(nodemap_addr);
- memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
-
- printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
- nodemap_addr, nodemap_addr + nodemap_size);
- return 0;
-}
-
-/*
- * The LSB of all start and end addresses in the node map is the value of the
- * maximum possible shift.
- */
-static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
-{
- int i, nodes_used = 0;
- unsigned long start, end;
- unsigned long bitfield = 0, memtop = 0;
-
- for (i = 0; i < mi->nr_blks; i++) {
- start = mi->blk[i].start;
- end = mi->blk[i].end;
- if (start >= end)
- continue;
- bitfield |= start;
- nodes_used++;
- if (end > memtop)
- memtop = end;
- }
- if (nodes_used <= 1)
- i = 63;
- else
- i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
- memnodemapsize = (memtop >> i)+1;
- return i;
-}
-
-static int __init compute_hash_shift(const struct numa_meminfo *mi)
-{
- int shift;
-
- shift = extract_lsb_from_nodes(mi);
- if (allocate_cachealigned_memnodemap())
- return -1;
- printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
- shift);
-
- if (populate_memnodemap(mi, shift) != 1) {
- printk(KERN_INFO "Your memory is not aligned you need to "
- "rebuild your kernel with a bigger NODEMAPSIZE "
- "shift=%d\n", shift);
- return -1;
- }
- return shift;
-}
-
-int __meminit __early_pfn_to_nid(unsigned long pfn)
-{
- return phys_to_nid(pfn << PAGE_SHIFT);
-}
-
-static void * __init early_node_mem(int nodeid, unsigned long start,
- unsigned long end, unsigned long size,
- unsigned long align)
-{
- unsigned long mem;
-
- /*
- * put it on high as possible
- * something will go with NODE_DATA
- */
- if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
- start = MAX_DMA_PFN<<PAGE_SHIFT;
- if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
- end > (MAX_DMA32_PFN<<PAGE_SHIFT))
- start = MAX_DMA32_PFN<<PAGE_SHIFT;
- mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
- if (mem != MEMBLOCK_ERROR)
- return __va(mem);
-
- /* extend the search scope */
- end = max_pfn_mapped << PAGE_SHIFT;
- start = MAX_DMA_PFN << PAGE_SHIFT;
- mem = memblock_find_in_range(start, end, size, align);
- if (mem != MEMBLOCK_ERROR)
- return __va(mem);
-
- printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
- size, nodeid);
-
- return NULL;
-}
-
-static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
- struct numa_meminfo *mi)
-{
- /* ignore zero length blks */
- if (start == end)
- return 0;
-
- /* whine about and ignore invalid blks */
- if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
- pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
- nid, start, end);
- return 0;
- }
-
- if (mi->nr_blks >= NR_NODE_MEMBLKS) {
- pr_err("NUMA: too many memblk ranges\n");
- return -EINVAL;
- }
-
- mi->blk[mi->nr_blks].start = start;
- mi->blk[mi->nr_blks].end = end;
- mi->blk[mi->nr_blks].nid = nid;
- mi->nr_blks++;
- return 0;
-}
-
-/**
- * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
- * @idx: Index of memblk to remove
- * @mi: numa_meminfo to remove memblk from
- *
- * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
- * decrementing @mi->nr_blks.
- */
-void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
-{
- mi->nr_blks--;
- memmove(&mi->blk[idx], &mi->blk[idx + 1],
- (mi->nr_blks - idx) * sizeof(mi->blk[0]));
-}
-
-/**
- * numa_add_memblk - Add one numa_memblk to numa_meminfo
- * @nid: NUMA node ID of the new memblk
- * @start: Start address of the new memblk
- * @end: End address of the new memblk
- *
- * Add a new memblk to the default numa_meminfo.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
-{
- return numa_add_memblk_to(nid, start, end, &numa_meminfo);
-}
-
-/* Initialize bootmem allocator for a node */
-void __init
-setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
-{
- unsigned long start_pfn, last_pfn, nodedata_phys;
- const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- int nid;
-
- if (!end)
- return;
-
- /*
- * Don't confuse VM with a node that doesn't have the
- * minimum amount of memory:
- */
- if (end && (end - start) < NODE_MIN_SIZE)
- return;
-
- start = roundup(start, ZONE_ALIGN);
-
- printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
- start, end);
-
- start_pfn = start >> PAGE_SHIFT;
- last_pfn = end >> PAGE_SHIFT;
-
- node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
- SMP_CACHE_BYTES);
- if (node_data[nodeid] == NULL)
- return;
- nodedata_phys = __pa(node_data[nodeid]);
- memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
- printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
- nodedata_phys + pgdat_size - 1);
- nid = phys_to_nid(nodedata_phys);
- if (nid != nodeid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
-
- memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
- NODE_DATA(nodeid)->node_id = nodeid;
- NODE_DATA(nodeid)->node_start_pfn = start_pfn;
- NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
-
- node_set_online(nodeid);
-}
-
-/**
- * numa_cleanup_meminfo - Cleanup a numa_meminfo
- * @mi: numa_meminfo to clean up
- *
- * Sanitize @mi by merging and removing unncessary memblks. Also check for
- * conflicts and clear unused memblks.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
-{
- const u64 low = 0;
- const u64 high = (u64)max_pfn << PAGE_SHIFT;
- int i, j, k;
-
- for (i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *bi = &mi->blk[i];
-
- /* make sure all blocks are inside the limits */
- bi->start = max(bi->start, low);
- bi->end = min(bi->end, high);
-
- /* and there's no empty block */
- if (bi->start == bi->end) {
- numa_remove_memblk_from(i--, mi);
- continue;
- }
-
- for (j = i + 1; j < mi->nr_blks; j++) {
- struct numa_memblk *bj = &mi->blk[j];
- unsigned long start, end;
-
- /*
- * See whether there are overlapping blocks. Whine
- * about but allow overlaps of the same nid. They
- * will be merged below.
- */
- if (bi->end > bj->start && bi->start < bj->end) {
- if (bi->nid != bj->nid) {
- pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
- bi->nid, bi->start, bi->end,
- bj->nid, bj->start, bj->end);
- return -EINVAL;
- }
- pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
- bi->nid, bi->start, bi->end,
- bj->start, bj->end);
- }
-
- /*
- * Join together blocks on the same node, holes
- * between which don't overlap with memory on other
- * nodes.
- */
- if (bi->nid != bj->nid)
- continue;
- start = max(min(bi->start, bj->start), low);
- end = min(max(bi->end, bj->end), high);
- for (k = 0; k < mi->nr_blks; k++) {
- struct numa_memblk *bk = &mi->blk[k];
-
- if (bi->nid == bk->nid)
- continue;
- if (start < bk->end && end > bk->start)
- break;
- }
- if (k < mi->nr_blks)
- continue;
- printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
- bi->nid, bi->start, bi->end, bj->start, bj->end,
- start, end);
- bi->start = start;
- bi->end = end;
- numa_remove_memblk_from(j--, mi);
- }
- }
-
- for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
- mi->blk[i].start = mi->blk[i].end = 0;
- mi->blk[i].nid = NUMA_NO_NODE;
- }
-
- return 0;
-}
-
-/*
- * Set nodes, which have memory in @mi, in *@nodemask.
- */
-static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
- const struct numa_meminfo *mi)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
- if (mi->blk[i].start != mi->blk[i].end &&
- mi->blk[i].nid != NUMA_NO_NODE)
- node_set(mi->blk[i].nid, *nodemask);
-}
-
-/**
- * numa_reset_distance - Reset NUMA distance table
- *
- * The current table is freed. The next numa_set_distance() call will
- * create a new one.
- */
-void __init numa_reset_distance(void)
-{
- size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
-
- /* numa_distance could be 1LU marking allocation failure, test cnt */
- if (numa_distance_cnt)
- memblock_x86_free_range(__pa(numa_distance),
- __pa(numa_distance) + size);
- numa_distance_cnt = 0;
- numa_distance = NULL; /* enable table creation */
-}
-
-static int __init numa_alloc_distance(void)
-{
- nodemask_t nodes_parsed;
- size_t size;
- int i, j, cnt = 0;
- u64 phys;
-
- /* size the new table and allocate it */
- nodes_parsed = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
-
- for_each_node_mask(i, nodes_parsed)
- cnt = i;
- cnt++;
- size = cnt * cnt * sizeof(numa_distance[0]);
-
- phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
- size, PAGE_SIZE);
- if (phys == MEMBLOCK_ERROR) {
- pr_warning("NUMA: Warning: can't allocate distance table!\n");
- /* don't retry until explicitly reset */
- numa_distance = (void *)1LU;
- return -ENOMEM;
- }
- memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
-
- numa_distance = __va(phys);
- numa_distance_cnt = cnt;
-
- /* fill with the default distances */
- for (i = 0; i < cnt; i++)
- for (j = 0; j < cnt; j++)
- numa_distance[i * cnt + j] = i == j ?
- LOCAL_DISTANCE : REMOTE_DISTANCE;
- printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
-
- return 0;
-}
-
-/**
- * numa_set_distance - Set NUMA distance from one NUMA to another
- * @from: the 'from' node to set distance
- * @to: the 'to' node to set distance
- * @distance: NUMA distance
- *
- * Set the distance from node @from to @to to @distance. If distance table
- * doesn't exist, one which is large enough to accommodate all the currently
- * known nodes will be created.
- *
- * If such table cannot be allocated, a warning is printed and further
- * calls are ignored until the distance table is reset with
- * numa_reset_distance().
- *
- * If @from or @to is higher than the highest known node at the time of
- * table creation or @distance doesn't make sense, the call is ignored.
- * This is to allow simplification of specific NUMA config implementations.
- */
-void __init numa_set_distance(int from, int to, int distance)
-{
- if (!numa_distance && numa_alloc_distance() < 0)
- return;
-
- if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
- printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
- from, to, distance);
- return;
- }
-
- if ((u8)distance != distance ||
- (from == to && distance != LOCAL_DISTANCE)) {
- pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
- from, to, distance);
- return;
- }
-
- numa_distance[from * numa_distance_cnt + to] = distance;
-}
-
-int __node_distance(int from, int to)
-{
- if (from >= numa_distance_cnt || to >= numa_distance_cnt)
- return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
- return numa_distance[from * numa_distance_cnt + to];
-}
-EXPORT_SYMBOL(__node_distance);
-
-/*
- * Sanity check to catch more bad NUMA configurations (they are amazingly
- * common). Make sure the nodes cover all memory.
- */
-static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
-{
- unsigned long numaram, e820ram;
- int i;
-
- numaram = 0;
- for (i = 0; i < mi->nr_blks; i++) {
- unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
- unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
- numaram += e - s;
- numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
- if ((long)numaram < 0)
- numaram = 0;
- }
-
- e820ram = max_pfn - (memblock_x86_hole_size(0,
- max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
- /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
- if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
- printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
- (numaram << PAGE_SHIFT) >> 20,
- (e820ram << PAGE_SHIFT) >> 20);
- return false;
- }
- return true;
-}
-
-static int __init numa_register_memblks(struct numa_meminfo *mi)
-{
- int i, nid;
-
- /* Account for nodes with cpus and no memory */
- node_possible_map = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&node_possible_map, mi);
- if (WARN_ON(nodes_empty(node_possible_map)))
- return -EINVAL;
-
- memnode_shift = compute_hash_shift(mi);
- if (memnode_shift < 0) {
- printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
- return -EINVAL;
- }
-
- for (i = 0; i < mi->nr_blks; i++)
- memblock_x86_register_active_regions(mi->blk[i].nid,
- mi->blk[i].start >> PAGE_SHIFT,
- mi->blk[i].end >> PAGE_SHIFT);
-
- /* for out of order entries */
- sort_node_map();
- if (!numa_meminfo_cover_memory(mi))
- return -EINVAL;
-
- /* Finally register nodes. */
- for_each_node_mask(nid, node_possible_map) {
- u64 start = (u64)max_pfn << PAGE_SHIFT;
- u64 end = 0;
-
- for (i = 0; i < mi->nr_blks; i++) {
- if (nid != mi->blk[i].nid)
- continue;
- start = min(mi->blk[i].start, start);
- end = max(mi->blk[i].end, end);
- }
-
- if (start < end)
- setup_node_bootmem(nid, start, end);
- }
-
- return 0;
-}
-
-/**
- * dummy_numma_init - Fallback dummy NUMA init
- *
- * Used if there's no underlying NUMA architecture, NUMA initialization
- * fails, or NUMA is disabled on the command line.
- *
- * Must online at least one node and add memory blocks that cover all
- * allowed memory. This function must not fail.
- */
-static int __init dummy_numa_init(void)
-{
- printk(KERN_INFO "%s\n",
- numa_off ? "NUMA turned off" : "No NUMA configuration found");
- printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
- 0LU, max_pfn << PAGE_SHIFT);
-
- node_set(0, numa_nodes_parsed);
- numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
-
- return 0;
-}
-
-static int __init numa_init(int (*init_func)(void))
-{
- int i;
- int ret;
-
- for (i = 0; i < MAX_LOCAL_APIC; i++)
- set_apicid_to_node(i, NUMA_NO_NODE);
-
- nodes_clear(numa_nodes_parsed);
- nodes_clear(node_possible_map);
- nodes_clear(node_online_map);
- memset(&numa_meminfo, 0, sizeof(numa_meminfo));
- remove_all_active_ranges();
- numa_reset_distance();
-
- ret = init_func();
- if (ret < 0)
- return ret;
- ret = numa_cleanup_meminfo(&numa_meminfo);
- if (ret < 0)
- return ret;
-
- numa_emulation(&numa_meminfo, numa_distance_cnt);
-
- ret = numa_register_memblks(&numa_meminfo);
- if (ret < 0)
- return ret;
-
- for (i = 0; i < nr_cpu_ids; i++) {
- int nid = early_cpu_to_node(i);
-
- if (nid == NUMA_NO_NODE)
- continue;
- if (!node_online(nid))
- numa_clear_node(i);
- }
- numa_init_array();
- return 0;
-}
-
void __init initmem_init(void)
{
- int ret;
-
- if (!numa_off) {
-#ifdef CONFIG_ACPI_NUMA
- ret = numa_init(x86_acpi_numa_init);
- if (!ret)
- return;
-#endif
-#ifdef CONFIG_AMD_NUMA
- ret = numa_init(amd_numa_init);
- if (!ret)
- return;
-#endif
- }
-
- numa_init(dummy_numa_init);
+ x86_numa_init();
}
unsigned long __init numa_free_all_bootmem(void)
@@ -656,12 +23,3 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
-
-int __cpuinit numa_cpu_node(int cpu)
-{
- int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
-
- if (apicid != BAD_APICID)
- return __apicid_to_node[apicid];
- return NUMA_NO_NODE;
-}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index ad091e4cff1..d0ed086b624 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -5,6 +5,7 @@
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h>
#include <asm/dma.h>
#include "numa_internal.h"
@@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
nr_nodes = MAX_NUMNODES;
}
- size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
+ /*
+ * Calculate target node size. x86_32 freaks on __udivdi3() so do
+ * the division in ulong number of pages and convert back.
+ */
+ size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
+ size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
+
/*
* Calculate the number of big nodes that can be allocated as a result
* of consolidating the remainder.
@@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
*/
while (nodes_weight(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
- u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
+ u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
int phys_blk;
@@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
static struct numa_meminfo ei __initdata;
static struct numa_meminfo pi __initdata;
- const u64 max_addr = max_pfn << PAGE_SHIFT;
+ const u64 max_addr = PFN_PHYS(max_pfn);
u8 *phys_dist = NULL;
size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
int max_emu_nid, dfl_phys_nid;
@@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
if (numa_dist_cnt) {
u64 phys;
- phys = memblock_find_in_range(0,
- (u64)max_pfn_mapped << PAGE_SHIFT,
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
phys_size, PAGE_SIZE);
if (phys == MEMBLOCK_ERROR) {
pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
@@ -454,10 +460,9 @@ void __cpuinit numa_remove_cpu(int cpu)
cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
+static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
- struct cpumask *mask;
- int nid, physnid, i;
+ int nid, physnid;
nid = early_cpu_to_node(cpu);
if (nid == NUMA_NO_NODE) {
@@ -467,28 +472,21 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
physnid = emu_nid_to_phys[nid];
- for_each_online_node(i) {
+ for_each_online_node(nid) {
if (emu_nid_to_phys[nid] != physnid)
continue;
- mask = debug_cpumask_set_cpu(cpu, enable);
- if (!mask)
- return;
-
- if (enable)
- cpumask_set_cpu(cpu, mask);
- else
- cpumask_clear_cpu(cpu, mask);
+ debug_cpumask_set_cpu(cpu, nid, enable);
}
}
void __cpuinit numa_add_cpu(int cpu)
{
- numa_set_cpumask(cpu, 1);
+ numa_set_cpumask(cpu, true);
}
void __cpuinit numa_remove_cpu(int cpu)
{
- numa_set_cpumask(cpu, 0);
+ numa_set_cpumask(cpu, false);
}
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index ef2d97377d7..7178c3afe05 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -19,6 +19,14 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
void __init numa_reset_distance(void);
+void __init x86_numa_init(void);
+
+#ifdef CONFIG_X86_64
+static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
+#else
+void __init init_alloc_remap(int nid, u64 start, u64 end);
+#endif
+
#ifdef CONFIG_NUMA_EMU
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
int numa_dist_cnt);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index e1d10690921..b0086567271 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -123,12 +123,11 @@ static int pageattr_test(void)
if (print)
printk(KERN_INFO "CPA self-test:\n");
- bm = vmalloc((max_pfn_mapped + 7) / 8);
+ bm = vzalloc((max_pfn_mapped + 7) / 8);
if (!bm) {
printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
return -ENOMEM;
}
- memset(bm, 0, (max_pfn_mapped + 7) / 8);
failed += print_split(&sa);
srandom32(100);
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 38e6d174c49..9f0614daea8 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
unsigned char *p;
struct prefix_bits prf;
int i;
- unsigned long rv;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
- if (reg_rop[i] == opcode) {
- rv = REG_READ;
+ if (reg_rop[i] == opcode)
goto do_work;
- }
for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
- if (reg_wop[i] == opcode) {
- rv = REG_WRITE;
+ if (reg_wop[i] == opcode)
goto do_work;
- }
printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
"0x%02x\n", opcode);
@@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
unsigned char *p;
struct prefix_bits prf;
int i;
- unsigned long rv;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
- if (imm_wop[i] == opcode) {
- rv = IMM_WRITE;
+ if (imm_wop[i] == opcode)
goto do_work;
- }
printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
"0x%02x\n", opcode);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat.c
index 8e9d3394f6d..81dbfdeb080 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat.c
@@ -26,8 +26,6 @@
int acpi_numa __initdata;
-static struct bootnode nodes_add[MAX_NUMNODES];
-
static __init int setup_node(int pxm)
{
return acpi_map_pxm_to_node(pxm);
@@ -37,7 +35,6 @@ static __init void bad_srat(void)
{
printk(KERN_ERR "SRAT: SRAT not used.\n");
acpi_numa = -1;
- memset(nodes_add, 0, sizeof(nodes_add));
}
static __init inline int srat_disabled(void)
@@ -131,73 +128,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
pxm, apic_id, node);
}
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
-/*
- * Update nodes_add[]
- * This code supports one contiguous hot add area per node
- */
-static void __init
-update_nodes_add(int node, unsigned long start, unsigned long end)
-{
- unsigned long s_pfn = start >> PAGE_SHIFT;
- unsigned long e_pfn = end >> PAGE_SHIFT;
- int changed = 0;
- struct bootnode *nd = &nodes_add[node];
-
- /* I had some trouble with strange memory hotadd regions breaking
- the boot. Be very strict here and reject anything unexpected.
- If you want working memory hotadd write correct SRATs.
-
- The node size check is a basic sanity check to guard against
- mistakes */
- if ((signed long)(end - start) < NODE_MIN_SIZE) {
- printk(KERN_ERR "SRAT: Hotplug area too small\n");
- return;
- }
-
- /* This check might be a bit too strict, but I'm keeping it for now. */
- if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
- printk(KERN_ERR
- "SRAT: Hotplug area %lu -> %lu has existing memory\n",
- s_pfn, e_pfn);
- return;
- }
-
- /* Looks good */
-
- if (nd->start == nd->end) {
- nd->start = start;
- nd->end = end;
- changed = 1;
- } else {
- if (nd->start == end) {
- nd->start = start;
- changed = 1;
- }
- if (nd->end == start) {
- nd->end = end;
- changed = 1;
- }
- if (!changed)
- printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
- }
-
- if (changed) {
- node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
- nd->start, nd->end);
- }
-}
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
- unsigned long start, end;
+ u64 start, end;
int node, pxm;
if (srat_disabled())
@@ -226,11 +167,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
return;
}
- printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
+ printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
start, end);
-
- if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
- update_nodes_add(node, start, end);
}
void __init acpi_numa_arch_fixup(void) {}
@@ -244,17 +182,3 @@ int __init x86_acpi_numa_init(void)
return ret;
return srat_disabled() ? -EINVAL : 0;
}
-
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
-int memory_add_physaddr_to_nid(u64 start)
-{
- int i, ret = 0;
-
- for_each_node(i)
- if (nodes_add[i].start <= start && nodes_add[i].end > start)
- ret = i;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
deleted file mode 100644
index 364f36bdfad..00000000000
--- a/arch/x86/mm/srat_32.c
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * Some of the code in this file has been gleaned from the 64 bit
- * discontigmem support code base.
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to Pat Gaughen <gone@us.ibm.com>
- */
-#include <linux/mm.h>
-#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/mmzone.h>
-#include <linux/acpi.h>
-#include <linux/nodemask.h>
-#include <asm/srat.h>
-#include <asm/topology.h>
-#include <asm/smp.h>
-#include <asm/e820.h>
-
-/*
- * proximity macros and definitions
- */
-#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
-#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
-#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
-#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
-/* bitmap length; _PXM is at most 255 */
-#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
-static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
-
-#define MAX_CHUNKS_PER_NODE 3
-#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
-struct node_memory_chunk_s {
- unsigned long start_pfn;
- unsigned long end_pfn;
- u8 pxm; // proximity domain of node
- u8 nid; // which cnode contains this chunk?
- u8 bank; // which mem bank on this node
-};
-static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
-
-static int __initdata num_memory_chunks; /* total number of memory chunks */
-static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
-
-int acpi_numa __initdata;
-
-static __init void bad_srat(void)
-{
- printk(KERN_ERR "SRAT: SRAT not used.\n");
- acpi_numa = -1;
- num_memory_chunks = 0;
-}
-
-static __init inline int srat_disabled(void)
-{
- return numa_off || acpi_numa < 0;
-}
-
-/* Identify CPU proximity domains */
-void __init
-acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
-{
- if (srat_disabled())
- return;
- if (cpu_affinity->header.length !=
- sizeof(struct acpi_srat_cpu_affinity)) {
- bad_srat();
- return;
- }
-
- if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
- return; /* empty entry */
-
- /* mark this node as "seen" in node bitmap */
- BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
-
- /* don't need to check apic_id here, because it is always 8 bits */
- apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
-
- printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
- cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
-}
-
-/*
- * Identify memory proximity domains and hot-remove capabilities.
- * Fill node memory chunk list structure.
- */
-void __init
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
-{
- unsigned long long paddr, size;
- unsigned long start_pfn, end_pfn;
- u8 pxm;
- struct node_memory_chunk_s *p, *q, *pend;
-
- if (srat_disabled())
- return;
- if (memory_affinity->header.length !=
- sizeof(struct acpi_srat_mem_affinity)) {
- bad_srat();
- return;
- }
-
- if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
- return; /* empty entry */
-
- pxm = memory_affinity->proximity_domain & 0xff;
-
- /* mark this node as "seen" in node bitmap */
- BMAP_SET(pxm_bitmap, pxm);
-
- /* calculate info for memory chunk structure */
- paddr = memory_affinity->base_address;
- size = memory_affinity->length;
-
- start_pfn = paddr >> PAGE_SHIFT;
- end_pfn = (paddr + size) >> PAGE_SHIFT;
-
-
- if (num_memory_chunks >= MAXCHUNKS) {
- printk(KERN_WARNING "Too many mem chunks in SRAT."
- " Ignoring %lld MBytes at %llx\n",
- size/(1024*1024), paddr);
- return;
- }
-
- /* Insertion sort based on base address */
- pend = &node_memory_chunk[num_memory_chunks];
- for (p = &node_memory_chunk[0]; p < pend; p++) {
- if (start_pfn < p->start_pfn)
- break;
- }
- if (p < pend) {
- for (q = pend; q >= p; q--)
- *(q + 1) = *q;
- }
- p->start_pfn = start_pfn;
- p->end_pfn = end_pfn;
- p->pxm = pxm;
-
- num_memory_chunks++;
-
- printk(KERN_DEBUG "Memory range %08lx to %08lx"
- " in proximity domain %02x %s\n",
- start_pfn, end_pfn,
- pxm,
- ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
- "enabled and removable" : "enabled" ) );
-}
-
-/* Callback for SLIT parsing */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
-{
-}
-
-void acpi_numa_arch_fixup(void)
-{
-}
-/*
- * The SRAT table always lists ascending addresses, so can always
- * assume that the first "start" address that you see is the real
- * start of the node, and that the current "end" address is after
- * the previous one.
- */
-static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
-{
- /*
- * Only add present memory as told by the e820.
- * There is no guarantee from the SRAT that the memory it
- * enumerates is present at boot time because it represents
- * *possible* memory hotplug areas the same as normal RAM.
- */
- if (memory_chunk->start_pfn >= max_pfn) {
- printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
- memory_chunk->start_pfn, memory_chunk->end_pfn);
- return -1;
- }
- if (memory_chunk->nid != nid)
- return -1;
-
- if (!node_has_online_mem(nid))
- node_start_pfn[nid] = memory_chunk->start_pfn;
-
- if (node_start_pfn[nid] > memory_chunk->start_pfn)
- node_start_pfn[nid] = memory_chunk->start_pfn;
-
- if (node_end_pfn[nid] < memory_chunk->end_pfn)
- node_end_pfn[nid] = memory_chunk->end_pfn;
-
- return 0;
-}
-
-int __init get_memcfg_from_srat(void)
-{
- int i, j, nid;
-
- if (srat_disabled())
- goto out_fail;
-
- if (acpi_numa_init() < 0)
- goto out_fail;
-
- if (num_memory_chunks == 0) {
- printk(KERN_DEBUG
- "could not find any ACPI SRAT memory areas.\n");
- goto out_fail;
- }
-
- /* Calculate total number of nodes in system from PXM bitmap and create
- * a set of sequential node IDs starting at zero. (ACPI doesn't seem
- * to specify the range of _PXM values.)
- */
- /*
- * MCD - we no longer HAVE to number nodes sequentially. PXM domain
- * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
- * 32, so we will continue numbering them in this manner until MAX_NUMNODES
- * approaches MAX_PXM_DOMAINS for i386.
- */
- nodes_clear(node_online_map);
- for (i = 0; i < MAX_PXM_DOMAINS; i++) {
- if (BMAP_TEST(pxm_bitmap, i)) {
- int nid = acpi_map_pxm_to_node(i);
- node_set_online(nid);
- }
- }
- BUG_ON(num_online_nodes() == 0);
-
- /* set cnode id in memory chunk structure */
- for (i = 0; i < num_memory_chunks; i++)
- node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
-
- printk(KERN_DEBUG "pxm bitmap: ");
- for (i = 0; i < sizeof(pxm_bitmap); i++) {
- printk(KERN_CONT "%02x ", pxm_bitmap[i]);
- }
- printk(KERN_CONT "\n");
- printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
- num_online_nodes());
- printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
- num_memory_chunks);
-
- for (i = 0; i < MAX_LOCAL_APIC; i++)
- set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
-
- for (j = 0; j < num_memory_chunks; j++){
- struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
- printk(KERN_DEBUG
- "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
- j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
- if (node_read_chunk(chunk->nid, chunk))
- continue;
-
- memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn,
- min(chunk->end_pfn, max_pfn));
- }
- /* for out of order entries in SRAT */
- sort_node_map();
-
- for_each_online_node(nid) {
- unsigned long start = node_start_pfn[nid];
- unsigned long end = min(node_end_pfn[nid], max_pfn);
-
- memory_present(nid, start, end);
- node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
- }
- return 1;
-out_fail:
- printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
- " table\n");
- return 0;
-}
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
new file mode 100644
index 00000000000..90568c33ddb
--- /dev/null
+++ b/arch/x86/net/Makefile
@@ -0,0 +1,4 @@
+#
+# Arch-specific network modules
+#
+obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
new file mode 100644
index 00000000000..66870223f8c
--- /dev/null
+++ b/arch/x86/net/bpf_jit.S
@@ -0,0 +1,140 @@
+/* bpf_jit.S : BPF JIT helper functions
+ *
+ * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Calling convention :
+ * rdi : skb pointer
+ * esi : offset of byte(s) to fetch in skb (can be scratched)
+ * r8 : copy of skb->data
+ * r9d : hlen = skb->len - skb->data_len
+ */
+#define SKBDATA %r8
+
+sk_load_word_ind:
+ .globl sk_load_word_ind
+
+ add %ebx,%esi /* offset += X */
+# test %esi,%esi /* if (offset < 0) goto bpf_error; */
+ js bpf_error
+
+sk_load_word:
+ .globl sk_load_word
+
+ mov %r9d,%eax # hlen
+ sub %esi,%eax # hlen - offset
+ cmp $3,%eax
+ jle bpf_slow_path_word
+ mov (SKBDATA,%rsi),%eax
+ bswap %eax /* ntohl() */
+ ret
+
+
+sk_load_half_ind:
+ .globl sk_load_half_ind
+
+ add %ebx,%esi /* offset += X */
+ js bpf_error
+
+sk_load_half:
+ .globl sk_load_half
+
+ mov %r9d,%eax
+ sub %esi,%eax # hlen - offset
+ cmp $1,%eax
+ jle bpf_slow_path_half
+ movzwl (SKBDATA,%rsi),%eax
+ rol $8,%ax # ntohs()
+ ret
+
+sk_load_byte_ind:
+ .globl sk_load_byte_ind
+ add %ebx,%esi /* offset += X */
+ js bpf_error
+
+sk_load_byte:
+ .globl sk_load_byte
+
+ cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */
+ jle bpf_slow_path_byte
+ movzbl (SKBDATA,%rsi),%eax
+ ret
+
+/**
+ * sk_load_byte_msh - BPF_S_LDX_B_MSH helper
+ *
+ * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf)
+ * Must preserve A accumulator (%eax)
+ * Inputs : %esi is the offset value, already known positive
+ */
+ENTRY(sk_load_byte_msh)
+ CFI_STARTPROC
+ cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
+ jle bpf_slow_path_byte_msh
+ movzbl (SKBDATA,%rsi),%ebx
+ and $15,%bl
+ shl $2,%bl
+ ret
+ CFI_ENDPROC
+ENDPROC(sk_load_byte_msh)
+
+bpf_error:
+# force a return 0 from jit handler
+ xor %eax,%eax
+ mov -8(%rbp),%rbx
+ leaveq
+ ret
+
+/* rsi contains offset and can be scratched */
+#define bpf_slow_path_common(LEN) \
+ push %rdi; /* save skb */ \
+ push %r9; \
+ push SKBDATA; \
+/* rsi already has offset */ \
+ mov $LEN,%ecx; /* len */ \
+ lea -12(%rbp),%rdx; \
+ call skb_copy_bits; \
+ test %eax,%eax; \
+ pop SKBDATA; \
+ pop %r9; \
+ pop %rdi
+
+
+bpf_slow_path_word:
+ bpf_slow_path_common(4)
+ js bpf_error
+ mov -12(%rbp),%eax
+ bswap %eax
+ ret
+
+bpf_slow_path_half:
+ bpf_slow_path_common(2)
+ js bpf_error
+ mov -12(%rbp),%ax
+ rol $8,%ax
+ movzwl %ax,%eax
+ ret
+
+bpf_slow_path_byte:
+ bpf_slow_path_common(1)
+ js bpf_error
+ movzbl -12(%rbp),%eax
+ ret
+
+bpf_slow_path_byte_msh:
+ xchg %eax,%ebx /* dont lose A , X is about to be scratched */
+ bpf_slow_path_common(1)
+ js bpf_error
+ movzbl -12(%rbp),%eax
+ and $15,%al
+ shl $2,%al
+ xchg %eax,%ebx
+ ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
new file mode 100644
index 00000000000..bfab3fa10ed
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -0,0 +1,654 @@
+/* bpf_jit_comp.c : BPF JIT compiler
+ *
+ * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/moduleloader.h>
+#include <asm/cacheflush.h>
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+
+/*
+ * Conventions :
+ * EAX : BPF A accumulator
+ * EBX : BPF X accumulator
+ * RDI : pointer to skb (first argument given to JIT function)
+ * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n)
+ * ECX,EDX,ESI : scratch registers
+ * r9d : skb->len - skb->data_len (headlen)
+ * r8 : skb->data
+ * -8(RBP) : saved RBX value
+ * -16(RBP)..-80(RBP) : BPF_MEMWORDS values
+ */
+int bpf_jit_enable __read_mostly;
+
+/*
+ * assembly code in arch/x86/net/bpf_jit.S
+ */
+extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
+extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[];
+
+static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+ if (len == 1)
+ *ptr = bytes;
+ else if (len == 2)
+ *(u16 *)ptr = bytes;
+ else {
+ *(u32 *)ptr = bytes;
+ barrier();
+ }
+ return ptr + len;
+}
+
+#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0)
+
+#define EMIT1(b1) EMIT(b1, 1)
+#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0)
+
+#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */
+#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */
+
+static inline bool is_imm8(int value)
+{
+ return value <= 127 && value >= -128;
+}
+
+static inline bool is_near(int offset)
+{
+ return offset <= 127 && offset >= -128;
+}
+
+#define EMIT_JMP(offset) \
+do { \
+ if (offset) { \
+ if (is_near(offset)) \
+ EMIT2(0xeb, offset); /* jmp .+off8 */ \
+ else \
+ EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \
+ } \
+} while (0)
+
+/* list of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define X86_JB 0x72
+#define X86_JAE 0x73
+#define X86_JE 0x74
+#define X86_JNE 0x75
+#define X86_JBE 0x76
+#define X86_JA 0x77
+
+#define EMIT_COND_JMP(op, offset) \
+do { \
+ if (is_near(offset)) \
+ EMIT2(op, offset); /* jxx .+off8 */ \
+ else { \
+ EMIT2(0x0f, op + 0x10); \
+ EMIT(offset, 4); /* jxx .+off32 */ \
+ } \
+} while (0)
+
+#define COND_SEL(CODE, TOP, FOP) \
+ case CODE: \
+ t_op = TOP; \
+ f_op = FOP; \
+ goto cond_branch
+
+
+#define SEEN_DATAREF 1 /* might call external helpers */
+#define SEEN_XREG 2 /* ebx is used */
+#define SEEN_MEM 4 /* use mem[] for temporary storage */
+
+static inline void bpf_flush_icache(void *start, void *end)
+{
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ smp_wmb();
+ flush_icache_range((unsigned long)start, (unsigned long)end);
+ set_fs(old_fs);
+}
+
+
+void bpf_jit_compile(struct sk_filter *fp)
+{
+ u8 temp[64];
+ u8 *prog;
+ unsigned int proglen, oldproglen = 0;
+ int ilen, i;
+ int t_offset, f_offset;
+ u8 t_op, f_op, seen = 0, pass;
+ u8 *image = NULL;
+ u8 *func;
+ int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */
+ unsigned int cleanup_addr; /* epilogue code offset */
+ unsigned int *addrs;
+ const struct sock_filter *filter = fp->insns;
+ int flen = fp->len;
+
+ if (!bpf_jit_enable)
+ return;
+
+ addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL);
+ if (addrs == NULL)
+ return;
+
+ /* Before first pass, make a rough estimation of addrs[]
+ * each bpf instruction is translated to less than 64 bytes
+ */
+ for (proglen = 0, i = 0; i < flen; i++) {
+ proglen += 64;
+ addrs[i] = proglen;
+ }
+ cleanup_addr = proglen; /* epilogue address */
+
+ for (pass = 0; pass < 10; pass++) {
+ /* no prologue/epilogue for trivial filters (RET something) */
+ proglen = 0;
+ prog = temp;
+
+ if (seen) {
+ EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */
+ EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */
+ /* note : must save %rbx in case bpf_error is hit */
+ if (seen & (SEEN_XREG | SEEN_DATAREF))
+ EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */
+ if (seen & SEEN_XREG)
+ CLEAR_X(); /* make sure we dont leek kernel memory */
+
+ /*
+ * If this filter needs to access skb data,
+ * loads r9 and r8 with :
+ * r9 = skb->len - skb->data_len
+ * r8 = skb->data
+ */
+ if (seen & SEEN_DATAREF) {
+ if (offsetof(struct sk_buff, len) <= 127)
+ /* mov off8(%rdi),%r9d */
+ EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len));
+ else {
+ /* mov off32(%rdi),%r9d */
+ EMIT3(0x44, 0x8b, 0x8f);
+ EMIT(offsetof(struct sk_buff, len), 4);
+ }
+ if (is_imm8(offsetof(struct sk_buff, data_len)))
+ /* sub off8(%rdi),%r9d */
+ EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len));
+ else {
+ EMIT3(0x44, 0x2b, 0x8f);
+ EMIT(offsetof(struct sk_buff, data_len), 4);
+ }
+
+ if (is_imm8(offsetof(struct sk_buff, data)))
+ /* mov off8(%rdi),%r8 */
+ EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data));
+ else {
+ /* mov off32(%rdi),%r8 */
+ EMIT3(0x4c, 0x8b, 0x87);
+ EMIT(offsetof(struct sk_buff, data), 4);
+ }
+ }
+ }
+
+ switch (filter[0].code) {
+ case BPF_S_RET_K:
+ case BPF_S_LD_W_LEN:
+ case BPF_S_ANC_PROTOCOL:
+ case BPF_S_ANC_IFINDEX:
+ case BPF_S_ANC_MARK:
+ case BPF_S_ANC_RXHASH:
+ case BPF_S_ANC_CPU:
+ case BPF_S_ANC_QUEUE:
+ case BPF_S_LD_W_ABS:
+ case BPF_S_LD_H_ABS:
+ case BPF_S_LD_B_ABS:
+ /* first instruction sets A register (or is RET 'constant') */
+ break;
+ default:
+ /* make sure we dont leak kernel information to user */
+ CLEAR_A(); /* A = 0 */
+ }
+
+ for (i = 0; i < flen; i++) {
+ unsigned int K = filter[i].k;
+
+ switch (filter[i].code) {
+ case BPF_S_ALU_ADD_X: /* A += X; */
+ seen |= SEEN_XREG;
+ EMIT2(0x01, 0xd8); /* add %ebx,%eax */
+ break;
+ case BPF_S_ALU_ADD_K: /* A += K; */
+ if (!K)
+ break;
+ if (is_imm8(K))
+ EMIT3(0x83, 0xc0, K); /* add imm8,%eax */
+ else
+ EMIT1_off32(0x05, K); /* add imm32,%eax */
+ break;
+ case BPF_S_ALU_SUB_X: /* A -= X; */
+ seen |= SEEN_XREG;
+ EMIT2(0x29, 0xd8); /* sub %ebx,%eax */
+ break;
+ case BPF_S_ALU_SUB_K: /* A -= K */
+ if (!K)
+ break;
+ if (is_imm8(K))
+ EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */
+ else
+ EMIT1_off32(0x2d, K); /* sub imm32,%eax */
+ break;
+ case BPF_S_ALU_MUL_X: /* A *= X; */
+ seen |= SEEN_XREG;
+ EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */
+ break;
+ case BPF_S_ALU_MUL_K: /* A *= K */
+ if (is_imm8(K))
+ EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */
+ else {
+ EMIT2(0x69, 0xc0); /* imul imm32,%eax */
+ EMIT(K, 4);
+ }
+ break;
+ case BPF_S_ALU_DIV_X: /* A /= X; */
+ seen |= SEEN_XREG;
+ EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
+ if (pc_ret0 != -1)
+ EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4));
+ else {
+ EMIT_COND_JMP(X86_JNE, 2 + 5);
+ CLEAR_A();
+ EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */
+ }
+ EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */
+ break;
+ case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */
+ EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */
+ EMIT(K, 4);
+ EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */
+ break;
+ case BPF_S_ALU_AND_X:
+ seen |= SEEN_XREG;
+ EMIT2(0x21, 0xd8); /* and %ebx,%eax */
+ break;
+ case BPF_S_ALU_AND_K:
+ if (K >= 0xFFFFFF00) {
+ EMIT2(0x24, K & 0xFF); /* and imm8,%al */
+ } else if (K >= 0xFFFF0000) {
+ EMIT2(0x66, 0x25); /* and imm16,%ax */
+ EMIT2(K, 2);
+ } else {
+ EMIT1_off32(0x25, K); /* and imm32,%eax */
+ }
+ break;
+ case BPF_S_ALU_OR_X:
+ seen |= SEEN_XREG;
+ EMIT2(0x09, 0xd8); /* or %ebx,%eax */
+ break;
+ case BPF_S_ALU_OR_K:
+ if (is_imm8(K))
+ EMIT3(0x83, 0xc8, K); /* or imm8,%eax */
+ else
+ EMIT1_off32(0x0d, K); /* or imm32,%eax */
+ break;
+ case BPF_S_ALU_LSH_X: /* A <<= X; */
+ seen |= SEEN_XREG;
+ EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
+ break;
+ case BPF_S_ALU_LSH_K:
+ if (K == 0)
+ break;
+ else if (K == 1)
+ EMIT2(0xd1, 0xe0); /* shl %eax */
+ else
+ EMIT3(0xc1, 0xe0, K);
+ break;
+ case BPF_S_ALU_RSH_X: /* A >>= X; */
+ seen |= SEEN_XREG;
+ EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */
+ break;
+ case BPF_S_ALU_RSH_K: /* A >>= K; */
+ if (K == 0)
+ break;
+ else if (K == 1)
+ EMIT2(0xd1, 0xe8); /* shr %eax */
+ else
+ EMIT3(0xc1, 0xe8, K);
+ break;
+ case BPF_S_ALU_NEG:
+ EMIT2(0xf7, 0xd8); /* neg %eax */
+ break;
+ case BPF_S_RET_K:
+ if (!K) {
+ if (pc_ret0 == -1)
+ pc_ret0 = i;
+ CLEAR_A();
+ } else {
+ EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
+ }
+ /* fallinto */
+ case BPF_S_RET_A:
+ if (seen) {
+ if (i != flen - 1) {
+ EMIT_JMP(cleanup_addr - addrs[i]);
+ break;
+ }
+ if (seen & SEEN_XREG)
+ EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */
+ EMIT1(0xc9); /* leaveq */
+ }
+ EMIT1(0xc3); /* ret */
+ break;
+ case BPF_S_MISC_TAX: /* X = A */
+ seen |= SEEN_XREG;
+ EMIT2(0x89, 0xc3); /* mov %eax,%ebx */
+ break;
+ case BPF_S_MISC_TXA: /* A = X */
+ seen |= SEEN_XREG;
+ EMIT2(0x89, 0xd8); /* mov %ebx,%eax */
+ break;
+ case BPF_S_LD_IMM: /* A = K */
+ if (!K)
+ CLEAR_A();
+ else
+ EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
+ break;
+ case BPF_S_LDX_IMM: /* X = K */
+ seen |= SEEN_XREG;
+ if (!K)
+ CLEAR_X();
+ else
+ EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */
+ break;
+ case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */
+ seen |= SEEN_MEM;
+ EMIT3(0x8b, 0x45, 0xf0 - K*4);
+ break;
+ case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */
+ seen |= SEEN_XREG | SEEN_MEM;
+ EMIT3(0x8b, 0x5d, 0xf0 - K*4);
+ break;
+ case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */
+ seen |= SEEN_MEM;
+ EMIT3(0x89, 0x45, 0xf0 - K*4);
+ break;
+ case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */
+ seen |= SEEN_XREG | SEEN_MEM;
+ EMIT3(0x89, 0x5d, 0xf0 - K*4);
+ break;
+ case BPF_S_LD_W_LEN: /* A = skb->len; */
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+ if (is_imm8(offsetof(struct sk_buff, len)))
+ /* mov off8(%rdi),%eax */
+ EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len));
+ else {
+ EMIT2(0x8b, 0x87);
+ EMIT(offsetof(struct sk_buff, len), 4);
+ }
+ break;
+ case BPF_S_LDX_W_LEN: /* X = skb->len; */
+ seen |= SEEN_XREG;
+ if (is_imm8(offsetof(struct sk_buff, len)))
+ /* mov off8(%rdi),%ebx */
+ EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len));
+ else {
+ EMIT2(0x8b, 0x9f);
+ EMIT(offsetof(struct sk_buff, len), 4);
+ }
+ break;
+ case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+ if (is_imm8(offsetof(struct sk_buff, protocol))) {
+ /* movzwl off8(%rdi),%eax */
+ EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol));
+ } else {
+ EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
+ EMIT(offsetof(struct sk_buff, protocol), 4);
+ }
+ EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */
+ break;
+ case BPF_S_ANC_IFINDEX:
+ if (is_imm8(offsetof(struct sk_buff, dev))) {
+ /* movq off8(%rdi),%rax */
+ EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev));
+ } else {
+ EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */
+ EMIT(offsetof(struct sk_buff, dev), 4);
+ }
+ EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */
+ EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+ EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */
+ EMIT(offsetof(struct net_device, ifindex), 4);
+ break;
+ case BPF_S_ANC_MARK:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+ if (is_imm8(offsetof(struct sk_buff, mark))) {
+ /* mov off8(%rdi),%eax */
+ EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark));
+ } else {
+ EMIT2(0x8b, 0x87);
+ EMIT(offsetof(struct sk_buff, mark), 4);
+ }
+ break;
+ case BPF_S_ANC_RXHASH:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4);
+ if (is_imm8(offsetof(struct sk_buff, rxhash))) {
+ /* mov off8(%rdi),%eax */
+ EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash));
+ } else {
+ EMIT2(0x8b, 0x87);
+ EMIT(offsetof(struct sk_buff, rxhash), 4);
+ }
+ break;
+ case BPF_S_ANC_QUEUE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+ if (is_imm8(offsetof(struct sk_buff, queue_mapping))) {
+ /* movzwl off8(%rdi),%eax */
+ EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping));
+ } else {
+ EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
+ EMIT(offsetof(struct sk_buff, queue_mapping), 4);
+ }
+ break;
+ case BPF_S_ANC_CPU:
+#ifdef CONFIG_SMP
+ EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */
+ EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */
+#else
+ CLEAR_A();
+#endif
+ break;
+ case BPF_S_LD_W_ABS:
+ func = sk_load_word;
+common_load: seen |= SEEN_DATAREF;
+ if ((int)K < 0)
+ goto out;
+ t_offset = func - (image + addrs[i]);
+ EMIT1_off32(0xbe, K); /* mov imm32,%esi */
+ EMIT1_off32(0xe8, t_offset); /* call */
+ break;
+ case BPF_S_LD_H_ABS:
+ func = sk_load_half;
+ goto common_load;
+ case BPF_S_LD_B_ABS:
+ func = sk_load_byte;
+ goto common_load;
+ case BPF_S_LDX_B_MSH:
+ if ((int)K < 0) {
+ if (pc_ret0 != -1) {
+ EMIT_JMP(addrs[pc_ret0] - addrs[i]);
+ break;
+ }
+ CLEAR_A();
+ EMIT_JMP(cleanup_addr - addrs[i]);
+ break;
+ }
+ seen |= SEEN_DATAREF | SEEN_XREG;
+ t_offset = sk_load_byte_msh - (image + addrs[i]);
+ EMIT1_off32(0xbe, K); /* mov imm32,%esi */
+ EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */
+ break;
+ case BPF_S_LD_W_IND:
+ func = sk_load_word_ind;
+common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG;
+ t_offset = func - (image + addrs[i]);
+ EMIT1_off32(0xbe, K); /* mov imm32,%esi */
+ EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */
+ break;
+ case BPF_S_LD_H_IND:
+ func = sk_load_half_ind;
+ goto common_load_ind;
+ case BPF_S_LD_B_IND:
+ func = sk_load_byte_ind;
+ goto common_load_ind;
+ case BPF_S_JMP_JA:
+ t_offset = addrs[i + K] - addrs[i];
+ EMIT_JMP(t_offset);
+ break;
+ COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE);
+ COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB);
+ COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE);
+ COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE);
+ COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE);
+ COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB);
+ COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE);
+ COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE);
+
+cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
+ t_offset = addrs[i + filter[i].jt] - addrs[i];
+
+ /* same targets, can avoid doing the test :) */
+ if (filter[i].jt == filter[i].jf) {
+ EMIT_JMP(t_offset);
+ break;
+ }
+
+ switch (filter[i].code) {
+ case BPF_S_JMP_JGT_X:
+ case BPF_S_JMP_JGE_X:
+ case BPF_S_JMP_JEQ_X:
+ seen |= SEEN_XREG;
+ EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */
+ break;
+ case BPF_S_JMP_JSET_X:
+ seen |= SEEN_XREG;
+ EMIT2(0x85, 0xd8); /* test %ebx,%eax */
+ break;
+ case BPF_S_JMP_JEQ_K:
+ if (K == 0) {
+ EMIT2(0x85, 0xc0); /* test %eax,%eax */
+ break;
+ }
+ case BPF_S_JMP_JGT_K:
+ case BPF_S_JMP_JGE_K:
+ if (K <= 127)
+ EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */
+ else
+ EMIT1_off32(0x3d, K); /* cmp imm32,%eax */
+ break;
+ case BPF_S_JMP_JSET_K:
+ if (K <= 0xFF)
+ EMIT2(0xa8, K); /* test imm8,%al */
+ else if (!(K & 0xFFFF00FF))
+ EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */
+ else if (K <= 0xFFFF) {
+ EMIT2(0x66, 0xa9); /* test imm16,%ax */
+ EMIT(K, 2);
+ } else {
+ EMIT1_off32(0xa9, K); /* test imm32,%eax */
+ }
+ break;
+ }
+ if (filter[i].jt != 0) {
+ if (filter[i].jf)
+ t_offset += is_near(f_offset) ? 2 : 6;
+ EMIT_COND_JMP(t_op, t_offset);
+ if (filter[i].jf)
+ EMIT_JMP(f_offset);
+ break;
+ }
+ EMIT_COND_JMP(f_op, f_offset);
+ break;
+ default:
+ /* hmm, too complex filter, give up with jit compiler */
+ goto out;
+ }
+ ilen = prog - temp;
+ if (image) {
+ if (unlikely(proglen + ilen > oldproglen)) {
+ pr_err("bpb_jit_compile fatal error\n");
+ kfree(addrs);
+ module_free(NULL, image);
+ return;
+ }
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ /* last bpf instruction is always a RET :
+ * use it to give the cleanup instruction(s) addr
+ */
+ cleanup_addr = proglen - 1; /* ret */
+ if (seen)
+ cleanup_addr -= 1; /* leaveq */
+ if (seen & SEEN_XREG)
+ cleanup_addr -= 4; /* mov -8(%rbp),%rbx */
+
+ if (image) {
+ WARN_ON(proglen != oldproglen);
+ break;
+ }
+ if (proglen == oldproglen) {
+ image = module_alloc(max_t(unsigned int,
+ proglen,
+ sizeof(struct work_struct)));
+ if (!image)
+ goto out;
+ }
+ oldproglen = proglen;
+ }
+ if (bpf_jit_enable > 1)
+ pr_err("flen=%d proglen=%u pass=%d image=%p\n",
+ flen, proglen, pass, image);
+
+ if (image) {
+ if (bpf_jit_enable > 1)
+ print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS,
+ 16, 1, image, proglen, false);
+
+ bpf_flush_icache(image, image + proglen);
+
+ fp->bpf_func = (void *)image;
+ }
+out:
+ kfree(addrs);
+ return;
+}
+
+static void jit_free_defer(struct work_struct *arg)
+{
+ module_free(NULL, arg);
+}
+
+/* run from softirq, we must use a work_struct to call
+ * module_free() from process context
+ */
+void bpf_jit_free(struct sk_filter *fp)
+{
+ if (fp->bpf_func != sk_run_filter) {
+ struct work_struct *work = (struct work_struct *)fp->bpf_func;
+
+ INIT_WORK(work, jit_free_defer);
+ schedule_work(work);
+ }
+}
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 2d49d4e19a3..bff89dfe361 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -11,21 +11,11 @@
#include <linux/oprofile.h>
#include <linux/sched.h>
#include <linux/mm.h>
-#include <asm/ptrace.h>
-#include <asm/uaccess.h>
-#include <asm/stacktrace.h>
#include <linux/compat.h>
+#include <linux/uaccess.h>
-static void backtrace_warning_symbol(void *data, char *msg,
- unsigned long symbol)
-{
- /* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
- /* Ignore warnings */
-}
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
static int backtrace_stack(void *data, char *name)
{
@@ -42,8 +32,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
}
static struct stacktrace_ops backtrace_ops = {
- .warning = backtrace_warning,
- .warning_symbol = backtrace_warning_symbol,
.stack = backtrace_stack,
.address = backtrace_address,
.walk_stack = print_context_stack,
@@ -53,13 +41,13 @@ static struct stacktrace_ops backtrace_ops = {
static struct stack_frame_ia32 *
dump_user_backtrace_32(struct stack_frame_ia32 *head)
{
+ /* Also check accessibility of one struct frame_head beyond: */
struct stack_frame_ia32 bufhead[2];
struct stack_frame_ia32 *fp;
+ unsigned long bytes;
- /* Also check accessibility of one struct frame_head beyond */
- if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
- return NULL;
- if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+ bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+ if (bytes != sizeof(bufhead))
return NULL;
fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
@@ -100,12 +88,12 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
{
+ /* Also check accessibility of one struct frame_head beyond: */
struct stack_frame bufhead[2];
+ unsigned long bytes;
- /* Also check accessibility of one struct stack_frame beyond */
- if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
- return NULL;
- if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+ bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+ if (bytes != sizeof(bufhead))
return NULL;
oprofile_add_trace(bufhead[0].return_address);
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cf9750004a0..68894fdc034 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -112,8 +112,10 @@ static void nmi_cpu_start(void *dummy)
static int nmi_start(void)
{
get_online_cpus();
- on_each_cpu(nmi_cpu_start, NULL, 1);
ctr_running = 1;
+ /* make ctr_running visible to the nmi handler: */
+ smp_mb();
+ on_each_cpu(nmi_cpu_start, NULL, 1);
put_online_cpus();
return 0;
}
@@ -504,15 +506,18 @@ static int nmi_setup(void)
nmi_enabled = 0;
ctr_running = 0;
- barrier();
+ /* make variables visible to the nmi handler: */
+ smp_mb();
err = register_die_notifier(&profile_exceptions_nb);
if (err)
goto fail;
get_online_cpus();
register_cpu_notifier(&oprofile_cpu_nb);
- on_each_cpu(nmi_cpu_setup, NULL, 1);
nmi_enabled = 1;
+ /* make nmi_enabled visible to the nmi handler: */
+ smp_mb();
+ on_each_cpu(nmi_cpu_setup, NULL, 1);
put_online_cpus();
return 0;
@@ -531,7 +536,8 @@ static void nmi_shutdown(void)
nmi_enabled = 0;
ctr_running = 0;
put_online_cpus();
- barrier();
+ /* make variables visible to the nmi handler: */
+ smp_mb();
unregister_die_notifier(&profile_exceptions_nb);
msrs = &get_cpu_var(cpu_msrs);
model->shutdown(msrs);
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index c3b8e24f2b1..9cbb710dc94 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -316,16 +316,23 @@ static void op_amd_stop_ibs(void)
wrmsrl(MSR_AMD64_IBSOPCTL, 0);
}
-static inline int eilvt_is_available(int offset)
+static inline int get_eilvt(int offset)
{
- /* check if we may assign a vector */
return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
}
+static inline int put_eilvt(int offset)
+{
+ return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
static inline int ibs_eilvt_valid(void)
{
int offset;
u64 val;
+ int valid = 0;
+
+ preempt_disable();
rdmsrl(MSR_AMD64_IBSCTL, val);
offset = val & IBSCTL_LVT_OFFSET_MASK;
@@ -333,16 +340,20 @@ static inline int ibs_eilvt_valid(void)
if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
- return 0;
+ goto out;
}
- if (!eilvt_is_available(offset)) {
+ if (!get_eilvt(offset)) {
pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
- return 0;
+ goto out;
}
- return 1;
+ valid = 1;
+out:
+ preempt_enable();
+
+ return valid;
}
static inline int get_ibs_offset(void)
@@ -598,69 +609,76 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
return 0;
}
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * setup the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset. This updates then
+ * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
+ * the IBS interrupt vector is called from op_amd_setup_ctrs()/op_-
+ * amd_cpu_shutdown() using the new offset.
+ */
static int force_ibs_eilvt_setup(void)
{
- int i;
+ int offset;
int ret;
- /* find the next free available EILVT entry */
- for (i = 1; i < 4; i++) {
- if (!eilvt_is_available(i))
- continue;
- ret = setup_ibs_ctl(i);
- if (ret)
- return ret;
- pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
- return 0;
+ preempt_disable();
+ /* find the next free available EILVT entry, skip offset 0 */
+ for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+ if (get_eilvt(offset))
+ break;
}
+ preempt_enable();
- printk(KERN_DEBUG "No EILVT entry available\n");
-
- return -EBUSY;
-}
-
-static int __init_ibs_nmi(void)
-{
- int ret;
-
- if (ibs_eilvt_valid())
- return 0;
+ if (offset == APIC_EILVT_NR_MAX) {
+ printk(KERN_DEBUG "No EILVT entry available\n");
+ return -EBUSY;
+ }
- ret = force_ibs_eilvt_setup();
+ ret = setup_ibs_ctl(offset);
if (ret)
- return ret;
+ goto out;
- if (!ibs_eilvt_valid())
- return -EFAULT;
+ if (!ibs_eilvt_valid()) {
+ ret = -EFAULT;
+ goto out;
+ }
+ pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset);
pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
return 0;
+out:
+ preempt_disable();
+ put_eilvt(offset);
+ preempt_enable();
+ return ret;
}
/*
* check and reserve APIC extended interrupt LVT offset for IBS if
* available
- *
- * init_ibs() preforms implicitly cpu-local operations, so pin this
- * thread to its current CPU
*/
static void init_ibs(void)
{
- preempt_disable();
-
ibs_caps = get_ibs_caps();
+
if (!ibs_caps)
+ return;
+
+ if (ibs_eilvt_valid())
goto out;
- if (__init_ibs_nmi() < 0)
- ibs_caps = 0;
- else
- printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
+ if (!force_ibs_eilvt_setup())
+ goto out;
+
+ /* Failed to setup ibs */
+ ibs_caps = 0;
+ return;
out:
- preempt_enable();
+ printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
}
static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 0972315c386..ae3cb23cd89 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -188,7 +188,7 @@ static bool resource_contains(struct resource *res, resource_size_t point)
return false;
}
-static void coalesce_windows(struct pci_root_info *info, int type)
+static void coalesce_windows(struct pci_root_info *info, unsigned long type)
{
int i, j;
struct resource *res1, *res2;
@@ -246,10 +246,9 @@ static void add_resources(struct pci_root_info *info)
conflict = insert_resource_conflict(root, res);
if (conflict)
- dev_err(&info->bridge->dev,
- "address space collision: host bridge window %pR "
- "conflicts with %s %pR\n",
- res, conflict->name, conflict);
+ dev_info(&info->bridge->dev,
+ "ignoring host bridge window %pR (conflicts with %s %pR)\n",
+ res, conflict->name, conflict);
else
pci_bus_add_resource(info->bus, res, 0);
}
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 67858be4b52..99176094500 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -257,6 +257,7 @@ static int ce4100_conf_read(unsigned int seg, unsigned int bus,
{
int i;
+ WARN_ON(seg);
if (bus == 1) {
for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
if (bus1_fixups[i].dev_func == devfn &&
@@ -282,6 +283,7 @@ static int ce4100_conf_write(unsigned int seg, unsigned int bus,
{
int i;
+ WARN_ON(seg);
if (bus == 1) {
for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
if (bus1_fixups[i].dev_func == devfn &&
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 5fe75026ecc..92df322e0b5 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -247,13 +247,6 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
},
#endif /* __i386__ */
{
- .callback = find_sort_method,
- .ident = "Dell System",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
- },
- },
- {
.callback = set_bf_sort,
.ident = "Dell PowerEdge 1950",
.matches = {
@@ -294,6 +287,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
},
},
{
+ .callback = find_sort_method,
+ .ident = "Dell System",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
+ },
+ },
+ {
.callback = set_bf_sort,
.ident = "HP ProLiant BL20p G3",
.matches = {
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index bd33620b007..4f2c70439d7 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -22,7 +22,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
{
unsigned long flags;
- if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
+ if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) {
*value = -1;
return -EINVAL;
}
@@ -53,7 +53,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
{
unsigned long flags;
- if ((bus > 255) || (devfn > 255) || (reg > 4095))
+ if (seg || (bus > 255) || (devfn > 255) || (reg > 4095))
return -EINVAL;
raw_spin_lock_irqsave(&pci_config_lock, flags);
@@ -97,6 +97,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
unsigned long flags;
int dev, fn;
+ WARN_ON(seg);
if ((bus > 255) || (devfn > 255) || (reg > 255)) {
*value = -1;
return -EINVAL;
@@ -138,6 +139,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
unsigned long flags;
int dev, fn;
+ WARN_ON(seg);
if ((bus > 255) || (devfn > 255) || (reg > 255))
return -EINVAL;
@@ -280,12 +282,9 @@ void __init pci_direct_init(int type)
int __init pci_direct_probe(void)
{
- struct resource *region, *region2;
-
if ((pci_probe & PCI_PROBE_CONF1) == 0)
goto type2;
- region = request_region(0xCF8, 8, "PCI conf1");
- if (!region)
+ if (!request_region(0xCF8, 8, "PCI conf1"))
goto type2;
if (pci_check_type1()) {
@@ -293,16 +292,14 @@ int __init pci_direct_probe(void)
port_cf9_safe = true;
return 1;
}
- release_resource(region);
+ release_region(0xCF8, 8);
type2:
if ((pci_probe & PCI_PROBE_CONF2) == 0)
return 0;
- region = request_region(0xCF8, 4, "PCI conf2");
- if (!region)
+ if (!request_region(0xCF8, 4, "PCI conf2"))
return 0;
- region2 = request_region(0xC000, 0x1000, "PCI conf2");
- if (!region2)
+ if (!request_region(0xC000, 0x1000, "PCI conf2"))
goto fail2;
if (pci_check_type2()) {
@@ -311,8 +308,8 @@ int __init pci_direct_probe(void)
return 2;
}
- release_resource(region2);
+ release_region(0xC000, 0x1000);
fail2:
- release_resource(region);
+ release_region(0xCF8, 4);
return 0;
}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 8201165bae2..372e9b8989b 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -602,7 +602,9 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
|| (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
|| (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
- device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) {
+ device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)
+ || (device >= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN &&
+ device <= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX)) {
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index e282886616a..301e325992f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -519,7 +519,8 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
if (cfg->address < 0xFFFFFFFF)
return 0;
- if (!strcmp(mcfg->header.oem_id, "SGI"))
+ if (!strcmp(mcfg->header.oem_id, "SGI") ||
+ !strcmp(mcfg->header.oem_id, "SGI2"))
return 0;
if (mcfg->header.revision >= 1) {
@@ -606,6 +607,16 @@ static void __init __pci_mmcfg_init(int early)
if (list_empty(&pci_mmcfg_list))
return;
+ if (pcibios_last_bus < 0) {
+ const struct pci_mmcfg_region *cfg;
+
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ if (cfg->segment)
+ break;
+ pcibios_last_bus = cfg->end_bus;
+ }
+ }
+
if (pci_mmcfg_arch_init())
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
else {
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 5c9e2458df4..512a88c4150 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -34,6 +34,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
unsigned long flags;
void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
+ WARN_ON(seg);
if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
return -EINVAL;
@@ -73,6 +74,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
unsigned long flags;
void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
+ WARN_ON(seg);
if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
return -EINVAL;
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index 13700ec8e2e..5262603b04d 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -206,6 +206,8 @@ static int pci_olpc_read(unsigned int seg, unsigned int bus,
{
uint32_t *addr;
+ WARN_ON(seg);
+
/* Use the hardware mechanism for non-simulated devices */
if (!is_simulated(bus, devfn))
return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
@@ -264,6 +266,8 @@ static int pci_olpc_read(unsigned int seg, unsigned int bus,
static int pci_olpc_write(unsigned int seg, unsigned int bus,
unsigned int devfn, int reg, int len, uint32_t value)
{
+ WARN_ON(seg);
+
/* Use the hardware mechanism for non-simulated devices */
if (!is_simulated(bus, devfn))
return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index a5f7d0d63de..f6855355146 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -181,6 +181,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
unsigned long flags;
unsigned long bx = (bus << 8) | devfn;
+ WARN_ON(seg);
if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
return -EINVAL;
@@ -247,6 +248,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
unsigned long flags;
unsigned long bx = (bus << 8) | devfn;
+ WARN_ON(seg);
if ((bus > 255) || (devfn > 255) || (reg > 255))
return -EINVAL;
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 03008f72eb0..6f2f8eeed17 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -24,7 +24,7 @@ static void pci_visws_disable_irq(struct pci_dev *dev) { }
unsigned int pci_bus0, pci_bus1;
-static int __init visws_map_irq(struct pci_dev *dev, u8 slot, u8 pin)
+static int __init visws_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
{
int irq, bus = dev->bus->number;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index e37b407a0ee..1017c7bee38 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -1,8 +1,13 @@
/*
- * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
- * x86 PCI core to support the Xen PCI Frontend
+ * Xen PCI - handle PCI (INTx) and MSI infrastructure calls for PV, HVM and
+ * initial domain support. We also handle the DSDT _PRT callbacks for GSI's
+ * used in HVM and initial domain mode (PV does not parse ACPI, so it has no
+ * concept of GSIs). Under PV we hook under the pnbbios API for IRQs and
+ * 0xcf8 PCI configuration read/write.
*
* Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Stefano Stabellini <stefano.stabellini@eu.citrix.com>
*/
#include <linux/module.h>
#include <linux/init.h>
@@ -19,22 +24,53 @@
#include <xen/events.h>
#include <asm/xen/pci.h>
+static int xen_pcifront_enable_irq(struct pci_dev *dev)
+{
+ int rc;
+ int share = 1;
+ int pirq;
+ u8 gsi;
+
+ rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
+ rc);
+ return rc;
+ }
+ /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
+ pirq = gsi;
+
+ if (gsi < NR_IRQS_LEGACY)
+ share = 0;
+
+ rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
+ gsi, pirq, rc);
+ return rc;
+ }
+
+ dev->irq = rc;
+ dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
+ return 0;
+}
+
#ifdef CONFIG_ACPI
-static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
- int trigger, int polarity)
+static int xen_register_pirq(u32 gsi, int gsi_override, int triggering,
+ bool set_pirq)
{
- int rc, irq;
+ int rc, pirq = -1, irq = -1;
struct physdev_map_pirq map_irq;
int shareable = 0;
char *name;
- if (!xen_hvm_domain())
- return -1;
+ if (set_pirq)
+ pirq = gsi;
map_irq.domid = DOMID_SELF;
map_irq.type = MAP_PIRQ_TYPE_GSI;
map_irq.index = gsi;
- map_irq.pirq = -1;
+ map_irq.pirq = pirq;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
if (rc) {
@@ -42,7 +78,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
return -1;
}
- if (trigger == ACPI_EDGE_SENSITIVE) {
+ if (triggering == ACPI_EDGE_SENSITIVE) {
shareable = 0;
name = "ioapic-edge";
} else {
@@ -50,12 +86,63 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
name = "ioapic-level";
}
+ if (gsi_override >= 0)
+ gsi = gsi_override;
+
irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
+ if (irq < 0)
+ goto out;
- printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
+ printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", map_irq.pirq, irq, gsi);
+out:
+ return irq;
+}
+
+static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ if (!xen_hvm_domain())
+ return -1;
+
+ return xen_register_pirq(gsi, -1 /* no GSI override */, trigger,
+ false /* no mapping of GSI to PIRQ */);
+}
+
+#ifdef CONFIG_XEN_DOM0
+static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
+{
+ int rc, irq;
+ struct physdev_setup_gsi setup_gsi;
+
+ if (!xen_pv_domain())
+ return -1;
+
+ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
+ gsi, triggering, polarity);
+
+ irq = xen_register_pirq(gsi, gsi_override, triggering, true);
+
+ setup_gsi.gsi = gsi;
+ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
+ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+ if (rc == -EEXIST)
+ printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
+ else if (rc) {
+ printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
+ gsi, rc);
+ }
return irq;
}
+
+static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity);
+}
+#endif
#endif
#if defined(CONFIG_PCI_MSI)
@@ -65,6 +152,43 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
struct xen_pci_frontend_ops *xen_pci_frontend;
EXPORT_SYMBOL_GPL(xen_pci_frontend);
+static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, ret, i;
+ struct msi_desc *msidesc;
+ int *v;
+
+ v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
+ if (!v)
+ return -ENOMEM;
+
+ if (type == PCI_CAP_ID_MSIX)
+ ret = xen_pci_frontend_enable_msix(dev, v, nvec);
+ else
+ ret = xen_pci_frontend_enable_msi(dev, v);
+ if (ret)
+ goto error;
+ i = 0;
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
+ (type == PCI_CAP_ID_MSIX) ?
+ "pcifront-msi-x" :
+ "pcifront-msi",
+ DOMID_SELF);
+ if (irq < 0)
+ goto free;
+ i++;
+ }
+ kfree(v);
+ return 0;
+
+error:
+ dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
+free:
+ kfree(v);
+ return ret;
+}
+
#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
@@ -108,7 +232,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
}
irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
(type == PCI_CAP_ID_MSIX) ?
- "msi-x" : "msi");
+ "msi-x" : "msi",
+ DOMID_SELF);
if (irq < 0)
goto error;
dev_dbg(&dev->dev,
@@ -122,66 +247,6 @@ error:
return -ENODEV;
}
-/*
- * For MSI interrupts we have to use drivers/xen/event.s functions to
- * allocate an irq_desc and setup the right */
-
-
-static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
- int irq, ret, i;
- struct msi_desc *msidesc;
- int *v;
-
- v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
- if (!v)
- return -ENOMEM;
-
- if (type == PCI_CAP_ID_MSIX)
- ret = xen_pci_frontend_enable_msix(dev, v, nvec);
- else
- ret = xen_pci_frontend_enable_msi(dev, v);
- if (ret)
- goto error;
- i = 0;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
- (type == PCI_CAP_ID_MSIX) ?
- "pcifront-msi-x" :
- "pcifront-msi");
- if (irq < 0)
- goto free;
- i++;
- }
- kfree(v);
- return 0;
-
-error:
- dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
-free:
- kfree(v);
- return ret;
-}
-
-static void xen_teardown_msi_irqs(struct pci_dev *dev)
-{
- struct msi_desc *msidesc;
-
- msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
- if (msidesc->msi_attrib.is_msix)
- xen_pci_frontend_disable_msix(dev);
- else
- xen_pci_frontend_disable_msi(dev);
-
- /* Free the IRQ's and the msidesc using the generic code. */
- default_teardown_msi_irqs(dev);
-}
-
-static void xen_teardown_msi_irq(unsigned int irq)
-{
- xen_destroy_irq(irq);
-}
-
#ifdef CONFIG_XEN_DOM0
static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
@@ -190,9 +255,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
list_for_each_entry(msidesc, &dev->msi_list, list) {
struct physdev_map_pirq map_irq;
+ domid_t domid;
+
+ domid = ret = xen_find_device_domain_owner(dev);
+ /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED,
+ * hence check ret value for < 0. */
+ if (ret < 0)
+ domid = DOMID_SELF;
memset(&map_irq, 0, sizeof(map_irq));
- map_irq.domid = DOMID_SELF;
+ map_irq.domid = domid;
map_irq.type = MAP_PIRQ_TYPE_MSI;
map_irq.index = -1;
map_irq.pirq = -1;
@@ -215,14 +287,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
if (ret) {
- dev_warn(&dev->dev, "xen map irq failed %d\n", ret);
+ dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n",
+ ret, domid);
goto out;
}
ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
map_irq.pirq, map_irq.index,
(type == PCI_CAP_ID_MSIX) ?
- "msi-x" : "msi");
+ "msi-x" : "msi",
+ domid);
if (ret < 0)
goto out;
}
@@ -231,45 +305,28 @@ out:
return ret;
}
#endif
-#endif
-static int xen_pcifront_enable_irq(struct pci_dev *dev)
+static void xen_teardown_msi_irqs(struct pci_dev *dev)
{
- int rc;
- int share = 1;
- int pirq;
- u8 gsi;
-
- rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
- if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
- rc);
- return rc;
- }
-
- rc = xen_allocate_pirq_gsi(gsi);
- if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
- gsi, rc);
- return rc;
- }
- pirq = rc;
+ struct msi_desc *msidesc;
- if (gsi < NR_IRQS_LEGACY)
- share = 0;
+ msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+ if (msidesc->msi_attrib.is_msix)
+ xen_pci_frontend_disable_msix(dev);
+ else
+ xen_pci_frontend_disable_msi(dev);
- rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
- if (rc < 0) {
- dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
- gsi, pirq, rc);
- return rc;
- }
+ /* Free the IRQ's and the msidesc using the generic code. */
+ default_teardown_msi_irqs(dev);
+}
- dev->irq = rc;
- dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
- return 0;
+static void xen_teardown_msi_irq(unsigned int irq)
+{
+ xen_destroy_irq(irq);
}
+#endif
+
int __init pci_xen_init(void)
{
if (!xen_pv_domain() || xen_initial_domain())
@@ -316,82 +373,13 @@ int __init pci_xen_hvm_init(void)
}
#ifdef CONFIG_XEN_DOM0
-static int xen_register_pirq(u32 gsi, int triggering)
-{
- int rc, pirq, irq = -1;
- struct physdev_map_pirq map_irq;
- int shareable = 0;
- char *name;
-
- if (!xen_pv_domain())
- return -1;
-
- if (triggering == ACPI_EDGE_SENSITIVE) {
- shareable = 0;
- name = "ioapic-edge";
- } else {
- shareable = 1;
- name = "ioapic-level";
- }
-
- pirq = xen_allocate_pirq_gsi(gsi);
- if (pirq < 0)
- goto out;
-
- irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
- if (irq < 0)
- goto out;
-
- printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq);
-
- map_irq.domid = DOMID_SELF;
- map_irq.type = MAP_PIRQ_TYPE_GSI;
- map_irq.index = gsi;
- map_irq.pirq = pirq;
-
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
- if (rc) {
- printk(KERN_WARNING "xen map irq failed %d\n", rc);
- return -1;
- }
-
-out:
- return irq;
-}
-
-static int xen_register_gsi(u32 gsi, int triggering, int polarity)
-{
- int rc, irq;
- struct physdev_setup_gsi setup_gsi;
-
- if (!xen_pv_domain())
- return -1;
-
- printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
- gsi, triggering, polarity);
-
- irq = xen_register_pirq(gsi, triggering);
-
- setup_gsi.gsi = gsi;
- setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
- setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
- if (rc == -EEXIST)
- printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
- else if (rc) {
- printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
- gsi, rc);
- }
-
- return irq;
-}
-
static __init void xen_setup_acpi_sci(void)
{
int rc;
int trigger, polarity;
int gsi = acpi_sci_override_gsi;
+ int irq = -1;
+ int gsi_override = -1;
if (!gsi)
return;
@@ -404,60 +392,131 @@ static __init void xen_setup_acpi_sci(void)
}
trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
-
+
printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d "
"polarity=%d\n", gsi, trigger, polarity);
- gsi = xen_register_gsi(gsi, trigger, polarity);
+ /* Before we bind the GSI to a Linux IRQ, check whether
+ * we need to override it with bus_irq (IRQ) value. Usually for
+ * IRQs below IRQ_LEGACY_IRQ this holds IRQ == GSI, as so:
+ * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level)
+ * but there are oddballs where the IRQ != GSI:
+ * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level)
+ * which ends up being: gsi_to_irq[9] == 20
+ * (which is what acpi_gsi_to_irq ends up calling when starting the
+ * the ACPI interpreter and keels over since IRQ 9 has not been
+ * setup as we had setup IRQ 20 for it).
+ */
+ if (acpi_gsi_to_irq(gsi, &irq) == 0) {
+ /* Use the provided value if it's valid. */
+ if (irq >= 0)
+ gsi_override = irq;
+ }
+
+ gsi = xen_register_gsi(gsi, gsi_override, trigger, polarity);
printk(KERN_INFO "xen: acpi sci %d\n", gsi);
return;
}
-static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
- int trigger, int polarity)
+int __init pci_xen_initial_domain(void)
{
- return xen_register_gsi(gsi, trigger, polarity);
-}
+ int irq;
-static int __init pci_xen_initial_domain(void)
-{
#ifdef CONFIG_PCI_MSI
x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
#endif
xen_setup_acpi_sci();
__acpi_register_gsi = acpi_register_gsi_xen;
+ /* Pre-allocate legacy irqs */
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+ int trigger, polarity;
+
+ if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
+ continue;
+ xen_register_pirq(irq, -1 /* no GSI override */,
+ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE,
+ true /* Map GSI to PIRQ */);
+ }
+ if (0 == nr_ioapics) {
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+ xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
+ }
return 0;
}
-void __init xen_setup_pirqs(void)
-{
- int pirq, irq;
+struct xen_device_domain_owner {
+ domid_t domain;
+ struct pci_dev *dev;
+ struct list_head list;
+};
- pci_xen_initial_domain();
+static DEFINE_SPINLOCK(dev_domain_list_spinlock);
+static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
- if (0 == nr_ioapics) {
- for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
- pirq = xen_allocate_pirq_gsi(irq);
- if (WARN(pirq < 0,
- "Could not allocate PIRQ for legacy interrupt\n"))
- break;
- irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
- }
- return;
+static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ list_for_each_entry(owner, &dev_domain_list, list) {
+ if (owner->dev == dev)
+ return owner;
}
+ return NULL;
+}
- /* Pre-allocate legacy irqs */
- for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
- int trigger, polarity;
+int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+ int domain = -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (owner)
+ domain = owner->domain;
+ spin_unlock(&dev_domain_list_spinlock);
+ return domain;
+}
+EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
- if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
- continue;
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
+{
+ struct xen_device_domain_owner *owner;
- xen_register_pirq(irq,
- trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE);
+ owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
+ if (!owner)
+ return -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ if (find_device(dev)) {
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return -EEXIST;
+ }
+ owner->domain = domain;
+ owner->dev = dev;
+ list_add_tail(&owner->list, &dev_domain_list);
+ spin_unlock(&dev_domain_list_spinlock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
+
+int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (!owner) {
+ spin_unlock(&dev_domain_list_spinlock);
+ return -ENODEV;
}
+ list_del(&owner->list);
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return 0;
}
+EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
#endif
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index 2d6d226f2b1..e70be38ce03 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -347,7 +347,7 @@
"pciclass0c03";
reg = <0x16800 0x0 0x0 0x0 0x0>;
- interrupts = <22 3>;
+ interrupts = <22 1>;
};
usb@d,1 {
@@ -357,7 +357,7 @@
"pciclass0c03";
reg = <0x16900 0x0 0x0 0x0 0x0>;
- interrupts = <22 3>;
+ interrupts = <22 1>;
};
sata@e,0 {
@@ -367,7 +367,7 @@
"pciclass0106";
reg = <0x17000 0x0 0x0 0x0 0x0>;
- interrupts = <23 3>;
+ interrupts = <23 1>;
};
flash@f,0 {
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 0fe27d7c625..3ae4128013e 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -51,7 +51,17 @@
int efi_enabled;
EXPORT_SYMBOL(efi_enabled);
-struct efi efi;
+struct efi __read_mostly efi = {
+ .mps = EFI_INVALID_TABLE_ADDR,
+ .acpi = EFI_INVALID_TABLE_ADDR,
+ .acpi20 = EFI_INVALID_TABLE_ADDR,
+ .smbios = EFI_INVALID_TABLE_ADDR,
+ .sal_systab = EFI_INVALID_TABLE_ADDR,
+ .boot_info = EFI_INVALID_TABLE_ADDR,
+ .hcdp = EFI_INVALID_TABLE_ADDR,
+ .uga = EFI_INVALID_TABLE_ADDR,
+ .uv_systab = EFI_INVALID_TABLE_ADDR,
+};
EXPORT_SYMBOL(efi);
struct efi_memory_map memmap;
@@ -79,26 +89,50 @@ early_param("add_efi_memmap", setup_add_efi_memmap);
static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
- return efi_call_virt2(get_time, tm, tc);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt2(get_time, tm, tc);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_set_time(efi_time_t *tm)
{
- return efi_call_virt1(set_time, tm);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt1(set_time, tm);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
efi_bool_t *pending,
efi_time_t *tm)
{
- return efi_call_virt3(get_wakeup_time,
- enabled, pending, tm);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt3(get_wakeup_time,
+ enabled, pending, tm);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
{
- return efi_call_virt2(set_wakeup_time,
- enabled, tm);
+ unsigned long flags;
+ efi_status_t status;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ status = efi_call_virt2(set_wakeup_time,
+ enabled, tm);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return status;
}
static efi_status_t virt_efi_get_variable(efi_char16_t *name,
@@ -122,7 +156,7 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
static efi_status_t virt_efi_set_variable(efi_char16_t *name,
efi_guid_t *vendor,
- unsigned long attr,
+ u32 attr,
unsigned long data_size,
void *data)
{
@@ -131,6 +165,18 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name,
data_size, data);
}
+static efi_status_t virt_efi_query_variable_info(u32 attr,
+ u64 *storage_space,
+ u64 *remaining_space,
+ u64 *max_variable_size)
+{
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ return efi_call_virt4(query_variable_info, attr, storage_space,
+ remaining_space, max_variable_size);
+}
+
static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
{
return efi_call_virt1(get_next_high_mono_count, count);
@@ -145,15 +191,26 @@ static void virt_efi_reset_system(int reset_type,
data_size, data);
}
-static efi_status_t virt_efi_set_virtual_address_map(
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map)
+static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
+ unsigned long count,
+ unsigned long sg_list)
{
- return efi_call_virt4(set_virtual_address_map,
- memory_map_size, descriptor_size,
- descriptor_version, virtual_map);
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ return efi_call_virt3(update_capsule, capsules, count, sg_list);
+}
+
+static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
+ unsigned long count,
+ u64 *max_size,
+ int *reset_type)
+{
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ return efi_call_virt4(query_capsule_caps, capsules, count, max_size,
+ reset_type);
}
static efi_status_t __init phys_efi_set_virtual_address_map(
@@ -175,11 +232,14 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
efi_time_cap_t *tc)
{
+ unsigned long flags;
efi_status_t status;
+ spin_lock_irqsave(&rtc_lock, flags);
efi_call_phys_prelog();
status = efi_call_phys2(efi_phys.get_time, tm, tc);
efi_call_phys_epilog();
+ spin_unlock_irqrestore(&rtc_lock, flags);
return status;
}
@@ -315,6 +375,61 @@ static void __init print_efi_memmap(void)
}
#endif /* EFI_DEBUG */
+void __init efi_reserve_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ u64 start = md->phys_addr;
+ u64 size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+ /* Only reserve where possible:
+ * - Not within any already allocated areas
+ * - Not over any memory area (really needed, if above?)
+ * - Not within any part of the kernel
+ * - Not the bios reserved area
+ */
+ if ((start+size >= virt_to_phys(_text)
+ && start <= virt_to_phys(_end)) ||
+ !e820_all_mapped(start, start+size, E820_RAM) ||
+ memblock_x86_check_reserved_size(&start, &size,
+ 1<<EFI_PAGE_SHIFT)) {
+ /* Could not reserve, skip it */
+ md->num_pages = 0;
+ memblock_dbg(PFX "Could not reserve boot range "
+ "[0x%010llx-0x%010llx]\n",
+ start, start+size-1);
+ } else
+ memblock_x86_reserve_range(start, start+size,
+ "EFI Boot");
+ }
+}
+
+static void __init efi_free_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+
+ /* Could not reserve boot area */
+ if (!size)
+ continue;
+
+ free_bootmem_late(start, size);
+ }
+}
+
void __init efi_init(void)
{
efi_config_table_t *config_tables;
@@ -460,19 +575,30 @@ void __init efi_init(void)
x86_platform.set_wallclock = efi_set_rtc_mmss;
#endif
- /* Setup for EFI runtime service */
- reboot_type = BOOT_EFI;
-
#if EFI_DEBUG
print_efi_memmap();
#endif
}
+void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
+{
+ u64 addr, npages;
+
+ addr = md->virt_addr;
+ npages = md->num_pages;
+
+ memrange_efi_to_native(&addr, &npages);
+
+ if (executable)
+ set_memory_x(addr, npages);
+ else
+ set_memory_nx(addr, npages);
+}
+
static void __init runtime_code_page_mkexec(void)
{
efi_memory_desc_t *md;
void *p;
- u64 addr, npages;
/* Make EFI runtime service code area executable */
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@@ -481,10 +607,7 @@ static void __init runtime_code_page_mkexec(void)
if (md->type != EFI_RUNTIME_SERVICES_CODE)
continue;
- addr = md->virt_addr;
- npages = md->num_pages;
- memrange_efi_to_native(&addr, &npages);
- set_memory_x(addr, npages);
+ efi_set_executable(md, true);
}
}
@@ -498,16 +621,47 @@ static void __init runtime_code_page_mkexec(void)
*/
void __init efi_enter_virtual_mode(void)
{
- efi_memory_desc_t *md;
+ efi_memory_desc_t *md, *prev_md = NULL;
efi_status_t status;
unsigned long size;
u64 end, systab, addr, npages, end_pfn;
- void *p, *va;
+ void *p, *va, *new_memmap = NULL;
+ int count = 0;
efi.systab = NULL;
+
+ /* Merge contiguous regions of the same type and attribute */
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ u64 prev_size;
+ md = p;
+
+ if (!prev_md) {
+ prev_md = md;
+ continue;
+ }
+
+ if (prev_md->type != md->type ||
+ prev_md->attribute != md->attribute) {
+ prev_md = md;
+ continue;
+ }
+
+ prev_size = prev_md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->phys_addr == (prev_md->phys_addr + prev_size)) {
+ prev_md->num_pages += md->num_pages;
+ md->type = EFI_RESERVED_TYPE;
+ md->attribute = 0;
+ continue;
+ }
+ prev_md = md;
+ }
+
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
- if (!(md->attribute & EFI_MEMORY_RUNTIME))
+ if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+ md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
continue;
size = md->num_pages << EFI_PAGE_SHIFT;
@@ -541,15 +695,21 @@ void __init efi_enter_virtual_mode(void)
systab += md->virt_addr - md->phys_addr;
efi.systab = (efi_system_table_t *) (unsigned long) systab;
}
+ new_memmap = krealloc(new_memmap,
+ (count + 1) * memmap.desc_size,
+ GFP_KERNEL);
+ memcpy(new_memmap + (count * memmap.desc_size), md,
+ memmap.desc_size);
+ count++;
}
BUG_ON(!efi.systab);
status = phys_efi_set_virtual_address_map(
- memmap.desc_size * memmap.nr_map,
+ memmap.desc_size * count,
memmap.desc_size,
memmap.desc_version,
- memmap.phys_map);
+ (efi_memory_desc_t *)__pa(new_memmap));
if (status != EFI_SUCCESS) {
printk(KERN_ALERT "Unable to switch EFI into virtual mode "
@@ -558,6 +718,13 @@ void __init efi_enter_virtual_mode(void)
}
/*
+ * Thankfully, it does seem that no runtime services other than
+ * SetVirtualAddressMap() will touch boot services code, so we can
+ * get rid of it all at this point
+ */
+ efi_free_boot_services();
+
+ /*
* Now that EFI is in virtual mode, update the function
* pointers in the runtime service table to the new virtual addresses.
*
@@ -572,11 +739,15 @@ void __init efi_enter_virtual_mode(void)
efi.set_variable = virt_efi_set_variable;
efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
efi.reset_system = virt_efi_reset_system;
- efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
+ efi.set_virtual_address_map = NULL;
+ efi.query_variable_info = virt_efi_query_variable_info;
+ efi.update_capsule = virt_efi_update_capsule;
+ efi.query_capsule_caps = virt_efi_query_capsule_caps;
if (__supported_pte_mask & _PAGE_NX)
runtime_code_page_mkexec();
early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
memmap.map = NULL;
+ kfree(new_memmap);
}
/*
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac0621a7ac3..ac3aa54e265 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -41,22 +41,7 @@
static pgd_t save_pgd __initdata;
static unsigned long efi_flags __initdata;
-static void __init early_mapping_set_exec(unsigned long start,
- unsigned long end,
- int executable)
-{
- unsigned long num_pages;
-
- start &= PMD_MASK;
- end = (end + PMD_SIZE - 1) & PMD_MASK;
- num_pages = (end - start) >> PAGE_SHIFT;
- if (executable)
- set_memory_x((unsigned long)__va(start), num_pages);
- else
- set_memory_nx((unsigned long)__va(start), num_pages);
-}
-
-static void __init early_runtime_code_mapping_set_exec(int executable)
+static void __init early_code_mapping_set_exec(int executable)
{
efi_memory_desc_t *md;
void *p;
@@ -64,14 +49,12 @@ static void __init early_runtime_code_mapping_set_exec(int executable)
if (!(__supported_pte_mask & _PAGE_NX))
return;
- /* Make EFI runtime service code area executable */
+ /* Make EFI service code area executable */
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
- if (md->type == EFI_RUNTIME_SERVICES_CODE) {
- unsigned long end;
- end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
- early_mapping_set_exec(md->phys_addr, end, executable);
- }
+ if (md->type == EFI_RUNTIME_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_CODE)
+ efi_set_executable(md, executable);
}
}
@@ -79,7 +62,7 @@ void __init efi_call_phys_prelog(void)
{
unsigned long vaddress;
- early_runtime_code_mapping_set_exec(1);
+ early_code_mapping_set_exec(1);
local_irq_save(efi_flags);
vaddress = (unsigned long)__va(0x0UL);
save_pgd = *pgd_offset_k(0x0UL);
@@ -95,7 +78,7 @@ void __init efi_call_phys_epilog(void)
set_pgd(pgd_offset_k(0x0UL), save_pgd);
__flush_tlb_all();
local_irq_restore(efi_flags);
- early_runtime_code_mapping_set_exec(0);
+ early_code_mapping_set_exec(0);
}
void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
@@ -107,8 +90,10 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
return ioremap(phys_addr, size);
last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
- if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
- return NULL;
+ if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) {
+ unsigned long top = last_map_pfn << PAGE_SHIFT;
+ efi_ioremap(top, size - (top - phys_addr), type);
+ }
return (void __iomem *)__va(phys_addr);
}
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 275dbc19e2c..7000e74b308 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -194,7 +194,7 @@ static unsigned long __init mrst_calibrate_tsc(void)
return 0;
}
-void __init mrst_time_init(void)
+static void __init mrst_time_init(void)
{
sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
switch (mrst_timer_options) {
@@ -216,7 +216,7 @@ void __init mrst_time_init(void)
apbt_time_init();
}
-void __cpuinit mrst_arch_setup(void)
+static void __cpuinit mrst_arch_setup(void)
{
if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
__mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c2a8cab65e5..fd332c53394 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,5 @@
-obj-$(CONFIG_OLPC) += olpc.o
-obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
-obj-$(CONFIG_OLPC) += olpc_ofw.o
-obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o
+obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o
+obj-$(CONFIG_OLPC_XO1_PM) += olpc-xo1-pm.o xo1-wakeup.o
+obj-$(CONFIG_OLPC_XO1_RTC) += olpc-xo1-rtc.o
+obj-$(CONFIG_OLPC_XO1_SCI) += olpc-xo1-sci.o
+obj-$(CONFIG_OLPC_XO15_SCI) += olpc-xo15-sci.o
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
new file mode 100644
index 00000000000..6f3855a5a2f
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -0,0 +1,215 @@
+/*
+ * Support for power management features of the OLPC XO-1 laptop
+ *
+ * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/cs5535.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/mfd/core.h>
+#include <linux/suspend.h>
+
+#include <asm/io.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo1-pm"
+
+static unsigned long acpi_base;
+static unsigned long pms_base;
+
+static u16 wakeup_mask = CS5536_PM_PWRBTN;
+
+static struct {
+ unsigned long address;
+ unsigned short segment;
+} ofw_bios_entry = { 0xF0000 + PAGE_OFFSET, __KERNEL_CS };
+
+/* Set bits in the wakeup mask */
+void olpc_xo1_pm_wakeup_set(u16 value)
+{
+ wakeup_mask |= value;
+}
+EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_set);
+
+/* Clear bits in the wakeup mask */
+void olpc_xo1_pm_wakeup_clear(u16 value)
+{
+ wakeup_mask &= ~value;
+}
+EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_clear);
+
+static int xo1_power_state_enter(suspend_state_t pm_state)
+{
+ unsigned long saved_sci_mask;
+ int r;
+
+ /* Only STR is supported */
+ if (pm_state != PM_SUSPEND_MEM)
+ return -EINVAL;
+
+ r = olpc_ec_cmd(EC_SET_SCI_INHIBIT, NULL, 0, NULL, 0);
+ if (r)
+ return r;
+
+ /*
+ * Save SCI mask (this gets lost since PM1_EN is used as a mask for
+ * wakeup events, which is not necessarily the same event set)
+ */
+ saved_sci_mask = inl(acpi_base + CS5536_PM1_STS);
+ saved_sci_mask &= 0xffff0000;
+
+ /* Save CPU state */
+ do_olpc_suspend_lowlevel();
+
+ /* Resume path starts here */
+
+ /* Restore SCI mask (using dword access to CS5536_PM1_EN) */
+ outl(saved_sci_mask, acpi_base + CS5536_PM1_STS);
+
+ /* Tell the EC to stop inhibiting SCIs */
+ olpc_ec_cmd(EC_SET_SCI_INHIBIT_RELEASE, NULL, 0, NULL, 0);
+
+ /*
+ * Tell the wireless module to restart USB communication.
+ * Must be done twice.
+ */
+ olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
+ olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
+
+ return 0;
+}
+
+asmlinkage int xo1_do_sleep(u8 sleep_state)
+{
+ void *pgd_addr = __va(read_cr3());
+
+ /* Program wakeup mask (using dword access to CS5536_PM1_EN) */
+ outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
+
+ __asm__("movl %0,%%eax" : : "r" (pgd_addr));
+ __asm__("call *(%%edi); cld"
+ : : "D" (&ofw_bios_entry));
+ __asm__("movb $0x34, %al\n\t"
+ "outb %al, $0x70\n\t"
+ "movb $0x30, %al\n\t"
+ "outb %al, $0x71\n\t");
+ return 0;
+}
+
+static void xo1_power_off(void)
+{
+ printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
+
+ /* Enable all of these controls with 0 delay */
+ outl(0x40000000, pms_base + CS5536_PM_SCLK);
+ outl(0x40000000, pms_base + CS5536_PM_IN_SLPCTL);
+ outl(0x40000000, pms_base + CS5536_PM_WKXD);
+ outl(0x40000000, pms_base + CS5536_PM_WKD);
+
+ /* Clear status bits (possibly unnecessary) */
+ outl(0x0002ffff, pms_base + CS5536_PM_SSC);
+ outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
+
+ /* Write SLP_EN bit to start the machinery */
+ outl(0x00002000, acpi_base + CS5536_PM1_CNT);
+}
+
+static int xo1_power_state_valid(suspend_state_t pm_state)
+{
+ /* suspend-to-RAM only */
+ return pm_state == PM_SUSPEND_MEM;
+}
+
+static const struct platform_suspend_ops xo1_suspend_ops = {
+ .valid = xo1_power_state_valid,
+ .enter = xo1_power_state_enter,
+};
+
+static int __devinit xo1_pm_probe(struct platform_device *pdev)
+{
+ struct resource *res;
+ int err;
+
+ /* don't run on non-XOs */
+ if (!machine_is_olpc())
+ return -ENODEV;
+
+ err = mfd_cell_enable(pdev);
+ if (err)
+ return err;
+
+ res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ if (!res) {
+ dev_err(&pdev->dev, "can't fetch device resource info\n");
+ return -EIO;
+ }
+ if (strcmp(pdev->name, "cs5535-pms") == 0)
+ pms_base = res->start;
+ else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
+ acpi_base = res->start;
+
+ /* If we have both addresses, we can override the poweroff hook */
+ if (pms_base && acpi_base) {
+ suspend_set_ops(&xo1_suspend_ops);
+ pm_power_off = xo1_power_off;
+ printk(KERN_INFO "OLPC XO-1 support registered\n");
+ }
+
+ return 0;
+}
+
+static int __devexit xo1_pm_remove(struct platform_device *pdev)
+{
+ mfd_cell_disable(pdev);
+
+ if (strcmp(pdev->name, "cs5535-pms") == 0)
+ pms_base = 0;
+ else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
+ acpi_base = 0;
+
+ pm_power_off = NULL;
+ return 0;
+}
+
+static struct platform_driver cs5535_pms_driver = {
+ .driver = {
+ .name = "cs5535-pms",
+ .owner = THIS_MODULE,
+ },
+ .probe = xo1_pm_probe,
+ .remove = __devexit_p(xo1_pm_remove),
+};
+
+static struct platform_driver cs5535_acpi_driver = {
+ .driver = {
+ .name = "olpc-xo1-pm-acpi",
+ .owner = THIS_MODULE,
+ },
+ .probe = xo1_pm_probe,
+ .remove = __devexit_p(xo1_pm_remove),
+};
+
+static int __init xo1_pm_init(void)
+{
+ int r;
+
+ r = platform_driver_register(&cs5535_pms_driver);
+ if (r)
+ return r;
+
+ r = platform_driver_register(&cs5535_acpi_driver);
+ if (r)
+ platform_driver_unregister(&cs5535_pms_driver);
+
+ return r;
+}
+arch_initcall(xo1_pm_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1-rtc.c b/arch/x86/platform/olpc/olpc-xo1-rtc.c
new file mode 100644
index 00000000000..a2b4efddd61
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-rtc.c
@@ -0,0 +1,81 @@
+/*
+ * Support for OLPC XO-1 Real Time Clock (RTC)
+ *
+ * Copyright (C) 2011 One Laptop per Child
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/mc146818rtc.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/of.h>
+
+#include <asm/msr.h>
+#include <asm/olpc.h>
+
+static void rtc_wake_on(struct device *dev)
+{
+ olpc_xo1_pm_wakeup_set(CS5536_PM_RTC);
+}
+
+static void rtc_wake_off(struct device *dev)
+{
+ olpc_xo1_pm_wakeup_clear(CS5536_PM_RTC);
+}
+
+static struct resource rtc_platform_resource[] = {
+ [0] = {
+ .start = RTC_PORT(0),
+ .end = RTC_PORT(1),
+ .flags = IORESOURCE_IO,
+ },
+ [1] = {
+ .start = RTC_IRQ,
+ .end = RTC_IRQ,
+ .flags = IORESOURCE_IRQ,
+ }
+};
+
+static struct cmos_rtc_board_info rtc_info = {
+ .rtc_day_alarm = 0,
+ .rtc_mon_alarm = 0,
+ .rtc_century = 0,
+ .wake_on = rtc_wake_on,
+ .wake_off = rtc_wake_off,
+};
+
+static struct platform_device xo1_rtc_device = {
+ .name = "rtc_cmos",
+ .id = -1,
+ .num_resources = ARRAY_SIZE(rtc_platform_resource),
+ .dev.platform_data = &rtc_info,
+ .resource = rtc_platform_resource,
+};
+
+static int __init xo1_rtc_init(void)
+{
+ int r;
+ struct device_node *node;
+
+ node = of_find_compatible_node(NULL, NULL, "olpc,xo1-rtc");
+ if (!node)
+ return 0;
+ of_node_put(node);
+
+ pr_info("olpc-xo1-rtc: Initializing OLPC XO-1 RTC\n");
+ rdmsrl(MSR_RTC_DOMA_OFFSET, rtc_info.rtc_day_alarm);
+ rdmsrl(MSR_RTC_MONA_OFFSET, rtc_info.rtc_mon_alarm);
+ rdmsrl(MSR_RTC_CEN_OFFSET, rtc_info.rtc_century);
+
+ r = platform_device_register(&xo1_rtc_device);
+ if (r)
+ return r;
+
+ device_init_wakeup(&xo1_rtc_device.dev, 1);
+ return 0;
+}
+arch_initcall(xo1_rtc_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
new file mode 100644
index 00000000000..1d4c783d732
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -0,0 +1,614 @@
+/*
+ * Support for OLPC XO-1 System Control Interrupts (SCI)
+ *
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/cs5535.h>
+#include <linux/device.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/mfd/core.h>
+#include <linux/power_supply.h>
+#include <linux/suspend.h>
+#include <linux/workqueue.h>
+
+#include <asm/io.h>
+#include <asm/msr.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo1-sci"
+#define PFX DRV_NAME ": "
+
+static unsigned long acpi_base;
+static struct input_dev *power_button_idev;
+static struct input_dev *ebook_switch_idev;
+static struct input_dev *lid_switch_idev;
+
+static int sci_irq;
+
+static bool lid_open;
+static bool lid_inverted;
+static int lid_wake_mode;
+
+enum lid_wake_modes {
+ LID_WAKE_ALWAYS,
+ LID_WAKE_OPEN,
+ LID_WAKE_CLOSE,
+};
+
+static const char * const lid_wake_mode_names[] = {
+ [LID_WAKE_ALWAYS] = "always",
+ [LID_WAKE_OPEN] = "open",
+ [LID_WAKE_CLOSE] = "close",
+};
+
+static void battery_status_changed(void)
+{
+ struct power_supply *psy = power_supply_get_by_name("olpc-battery");
+
+ if (psy) {
+ power_supply_changed(psy);
+ put_device(psy->dev);
+ }
+}
+
+static void ac_status_changed(void)
+{
+ struct power_supply *psy = power_supply_get_by_name("olpc-ac");
+
+ if (psy) {
+ power_supply_changed(psy);
+ put_device(psy->dev);
+ }
+}
+
+/* Report current ebook switch state through input layer */
+static void send_ebook_state(void)
+{
+ unsigned char state;
+
+ if (olpc_ec_cmd(EC_READ_EB_MODE, NULL, 0, &state, 1)) {
+ pr_err(PFX "failed to get ebook state\n");
+ return;
+ }
+
+ input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state);
+ input_sync(ebook_switch_idev);
+}
+
+static void flip_lid_inverter(void)
+{
+ /* gpio is high; invert so we'll get l->h event interrupt */
+ if (lid_inverted)
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
+ else
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
+ lid_inverted = !lid_inverted;
+}
+
+static void detect_lid_state(void)
+{
+ /*
+ * the edge detector hookup on the gpio inputs on the geode is
+ * odd, to say the least. See http://dev.laptop.org/ticket/5703
+ * for details, but in a nutshell: we don't use the edge
+ * detectors. instead, we make use of an anomoly: with the both
+ * edge detectors turned off, we still get an edge event on a
+ * positive edge transition. to take advantage of this, we use the
+ * front-end inverter to ensure that that's the edge we're always
+ * going to see next.
+ */
+
+ int state;
+
+ state = cs5535_gpio_isset(OLPC_GPIO_LID, GPIO_READ_BACK);
+ lid_open = !state ^ !lid_inverted; /* x ^^ y */
+ if (!state)
+ return;
+
+ flip_lid_inverter();
+}
+
+/* Report current lid switch state through input layer */
+static void send_lid_state(void)
+{
+ input_report_switch(lid_switch_idev, SW_LID, !lid_open);
+ input_sync(lid_switch_idev);
+}
+
+static ssize_t lid_wake_mode_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ const char *mode = lid_wake_mode_names[lid_wake_mode];
+ return sprintf(buf, "%s\n", mode);
+}
+static ssize_t lid_wake_mode_set(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(lid_wake_mode_names); i++) {
+ const char *mode = lid_wake_mode_names[i];
+ if (strlen(mode) != count || strncasecmp(mode, buf, count))
+ continue;
+
+ lid_wake_mode = i;
+ return count;
+ }
+ return -EINVAL;
+}
+static DEVICE_ATTR(lid_wake_mode, S_IWUSR | S_IRUGO, lid_wake_mode_show,
+ lid_wake_mode_set);
+
+/*
+ * Process all items in the EC's SCI queue.
+ *
+ * This is handled in a workqueue because olpc_ec_cmd can be slow (and
+ * can even timeout).
+ *
+ * If propagate_events is false, the queue is drained without events being
+ * generated for the interrupts.
+ */
+static void process_sci_queue(bool propagate_events)
+{
+ int r;
+ u16 data;
+
+ do {
+ r = olpc_ec_sci_query(&data);
+ if (r || !data)
+ break;
+
+ pr_debug(PFX "SCI 0x%x received\n", data);
+
+ switch (data) {
+ case EC_SCI_SRC_BATERR:
+ case EC_SCI_SRC_BATSOC:
+ case EC_SCI_SRC_BATTERY:
+ case EC_SCI_SRC_BATCRIT:
+ battery_status_changed();
+ break;
+ case EC_SCI_SRC_ACPWR:
+ ac_status_changed();
+ break;
+ }
+
+ if (data == EC_SCI_SRC_EBOOK && propagate_events)
+ send_ebook_state();
+ } while (data);
+
+ if (r)
+ pr_err(PFX "Failed to clear SCI queue");
+}
+
+static void process_sci_queue_work(struct work_struct *work)
+{
+ process_sci_queue(true);
+}
+
+static DECLARE_WORK(sci_work, process_sci_queue_work);
+
+static irqreturn_t xo1_sci_intr(int irq, void *dev_id)
+{
+ struct platform_device *pdev = dev_id;
+ u32 sts;
+ u32 gpe;
+
+ sts = inl(acpi_base + CS5536_PM1_STS);
+ outl(sts | 0xffff, acpi_base + CS5536_PM1_STS);
+
+ gpe = inl(acpi_base + CS5536_PM_GPE0_STS);
+ outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
+
+ dev_dbg(&pdev->dev, "sts %x gpe %x\n", sts, gpe);
+
+ if (sts & CS5536_PWRBTN_FLAG && !(sts & CS5536_WAK_FLAG)) {
+ input_report_key(power_button_idev, KEY_POWER, 1);
+ input_sync(power_button_idev);
+ input_report_key(power_button_idev, KEY_POWER, 0);
+ input_sync(power_button_idev);
+ }
+
+ if (gpe & CS5536_GPIOM7_PME_FLAG) { /* EC GPIO */
+ cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS);
+ schedule_work(&sci_work);
+ }
+
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
+ detect_lid_state();
+ send_lid_state();
+
+ return IRQ_HANDLED;
+}
+
+static int xo1_sci_suspend(struct platform_device *pdev, pm_message_t state)
+{
+ if (device_may_wakeup(&power_button_idev->dev))
+ olpc_xo1_pm_wakeup_set(CS5536_PM_PWRBTN);
+ else
+ olpc_xo1_pm_wakeup_clear(CS5536_PM_PWRBTN);
+
+ if (device_may_wakeup(&ebook_switch_idev->dev))
+ olpc_ec_wakeup_set(EC_SCI_SRC_EBOOK);
+ else
+ olpc_ec_wakeup_clear(EC_SCI_SRC_EBOOK);
+
+ if (!device_may_wakeup(&lid_switch_idev->dev)) {
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+ } else if ((lid_open && lid_wake_mode == LID_WAKE_OPEN) ||
+ (!lid_open && lid_wake_mode == LID_WAKE_CLOSE)) {
+ flip_lid_inverter();
+
+ /* we may have just caused an event */
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
+
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+ }
+
+ return 0;
+}
+
+static int xo1_sci_resume(struct platform_device *pdev)
+{
+ /*
+ * We don't know what may have happened while we were asleep.
+ * Reestablish our lid setup so we're sure to catch all transitions.
+ */
+ detect_lid_state();
+ send_lid_state();
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+
+ /* Enable all EC events */
+ olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+ /* Power/battery status might have changed too */
+ battery_status_changed();
+ ac_status_changed();
+ return 0;
+}
+
+static int __devinit setup_sci_interrupt(struct platform_device *pdev)
+{
+ u32 lo, hi;
+ u32 sts;
+ int r;
+
+ rdmsr(0x51400020, lo, hi);
+ sci_irq = (lo >> 20) & 15;
+
+ if (sci_irq) {
+ dev_info(&pdev->dev, "SCI is mapped to IRQ %d\n", sci_irq);
+ } else {
+ /* Zero means masked */
+ dev_info(&pdev->dev, "SCI unmapped. Mapping to IRQ 3\n");
+ sci_irq = 3;
+ lo |= 0x00300000;
+ wrmsrl(0x51400020, lo);
+ }
+
+ /* Select level triggered in PIC */
+ if (sci_irq < 8) {
+ lo = inb(CS5536_PIC_INT_SEL1);
+ lo |= 1 << sci_irq;
+ outb(lo, CS5536_PIC_INT_SEL1);
+ } else {
+ lo = inb(CS5536_PIC_INT_SEL2);
+ lo |= 1 << (sci_irq - 8);
+ outb(lo, CS5536_PIC_INT_SEL2);
+ }
+
+ /* Enable SCI from power button, and clear pending interrupts */
+ sts = inl(acpi_base + CS5536_PM1_STS);
+ outl((CS5536_PM_PWRBTN << 16) | 0xffff, acpi_base + CS5536_PM1_STS);
+
+ r = request_irq(sci_irq, xo1_sci_intr, 0, DRV_NAME, pdev);
+ if (r)
+ dev_err(&pdev->dev, "can't request interrupt\n");
+
+ return r;
+}
+
+static int __devinit setup_ec_sci(void)
+{
+ int r;
+
+ r = gpio_request(OLPC_GPIO_ECSCI, "OLPC-ECSCI");
+ if (r)
+ return r;
+
+ gpio_direction_input(OLPC_GPIO_ECSCI);
+
+ /* Clear pending EC SCI events */
+ cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS);
+ cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_POSITIVE_EDGE_STS);
+
+ /*
+ * Enable EC SCI events, and map them to both a PME and the SCI
+ * interrupt.
+ *
+ * Ordinarily, in addition to functioning as GPIOs, Geode GPIOs can
+ * be mapped to regular interrupts *or* Geode-specific Power
+ * Management Events (PMEs) - events that bring the system out of
+ * suspend. In this case, we want both of those things - the system
+ * wakeup, *and* the ability to get an interrupt when an event occurs.
+ *
+ * To achieve this, we map the GPIO to a PME, and then we use one
+ * of the many generic knobs on the CS5535 PIC to additionally map the
+ * PME to the regular SCI interrupt line.
+ */
+ cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_EVENTS_ENABLE);
+
+ /* Set the SCI to cause a PME event on group 7 */
+ cs5535_gpio_setup_event(OLPC_GPIO_ECSCI, 7, 1);
+
+ /* And have group 7 also fire the SCI interrupt */
+ cs5535_pic_unreqz_select_high(7, sci_irq);
+
+ return 0;
+}
+
+static void free_ec_sci(void)
+{
+ gpio_free(OLPC_GPIO_ECSCI);
+}
+
+static int __devinit setup_lid_events(void)
+{
+ int r;
+
+ r = gpio_request(OLPC_GPIO_LID, "OLPC-LID");
+ if (r)
+ return r;
+
+ gpio_direction_input(OLPC_GPIO_LID);
+
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
+ lid_inverted = 0;
+
+ /* Clear edge detection and event enable for now */
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_EN);
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_EN);
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
+
+ /* Set the LID to cause an PME event on group 6 */
+ cs5535_gpio_setup_event(OLPC_GPIO_LID, 6, 1);
+
+ /* Set PME group 6 to fire the SCI interrupt */
+ cs5535_gpio_set_irq(6, sci_irq);
+
+ /* Enable the event */
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+
+ return 0;
+}
+
+static void free_lid_events(void)
+{
+ gpio_free(OLPC_GPIO_LID);
+}
+
+static int __devinit setup_power_button(struct platform_device *pdev)
+{
+ int r;
+
+ power_button_idev = input_allocate_device();
+ if (!power_button_idev)
+ return -ENOMEM;
+
+ power_button_idev->name = "Power Button";
+ power_button_idev->phys = DRV_NAME "/input0";
+ set_bit(EV_KEY, power_button_idev->evbit);
+ set_bit(KEY_POWER, power_button_idev->keybit);
+
+ power_button_idev->dev.parent = &pdev->dev;
+ device_init_wakeup(&power_button_idev->dev, 1);
+
+ r = input_register_device(power_button_idev);
+ if (r) {
+ dev_err(&pdev->dev, "failed to register power button: %d\n", r);
+ input_free_device(power_button_idev);
+ }
+
+ return r;
+}
+
+static void free_power_button(void)
+{
+ input_unregister_device(power_button_idev);
+ input_free_device(power_button_idev);
+}
+
+static int __devinit setup_ebook_switch(struct platform_device *pdev)
+{
+ int r;
+
+ ebook_switch_idev = input_allocate_device();
+ if (!ebook_switch_idev)
+ return -ENOMEM;
+
+ ebook_switch_idev->name = "EBook Switch";
+ ebook_switch_idev->phys = DRV_NAME "/input1";
+ set_bit(EV_SW, ebook_switch_idev->evbit);
+ set_bit(SW_TABLET_MODE, ebook_switch_idev->swbit);
+
+ ebook_switch_idev->dev.parent = &pdev->dev;
+ device_set_wakeup_capable(&ebook_switch_idev->dev, true);
+
+ r = input_register_device(ebook_switch_idev);
+ if (r) {
+ dev_err(&pdev->dev, "failed to register ebook switch: %d\n", r);
+ input_free_device(ebook_switch_idev);
+ }
+
+ return r;
+}
+
+static void free_ebook_switch(void)
+{
+ input_unregister_device(ebook_switch_idev);
+ input_free_device(ebook_switch_idev);
+}
+
+static int __devinit setup_lid_switch(struct platform_device *pdev)
+{
+ int r;
+
+ lid_switch_idev = input_allocate_device();
+ if (!lid_switch_idev)
+ return -ENOMEM;
+
+ lid_switch_idev->name = "Lid Switch";
+ lid_switch_idev->phys = DRV_NAME "/input2";
+ set_bit(EV_SW, lid_switch_idev->evbit);
+ set_bit(SW_LID, lid_switch_idev->swbit);
+
+ lid_switch_idev->dev.parent = &pdev->dev;
+ device_set_wakeup_capable(&lid_switch_idev->dev, true);
+
+ r = input_register_device(lid_switch_idev);
+ if (r) {
+ dev_err(&pdev->dev, "failed to register lid switch: %d\n", r);
+ goto err_register;
+ }
+
+ r = device_create_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode);
+ if (r) {
+ dev_err(&pdev->dev, "failed to create wake mode attr: %d\n", r);
+ goto err_create_attr;
+ }
+
+ return 0;
+
+err_create_attr:
+ input_unregister_device(lid_switch_idev);
+err_register:
+ input_free_device(lid_switch_idev);
+ return r;
+}
+
+static void free_lid_switch(void)
+{
+ device_remove_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode);
+ input_unregister_device(lid_switch_idev);
+ input_free_device(lid_switch_idev);
+}
+
+static int __devinit xo1_sci_probe(struct platform_device *pdev)
+{
+ struct resource *res;
+ int r;
+
+ /* don't run on non-XOs */
+ if (!machine_is_olpc())
+ return -ENODEV;
+
+ r = mfd_cell_enable(pdev);
+ if (r)
+ return r;
+
+ res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ if (!res) {
+ dev_err(&pdev->dev, "can't fetch device resource info\n");
+ return -EIO;
+ }
+ acpi_base = res->start;
+
+ r = setup_power_button(pdev);
+ if (r)
+ return r;
+
+ r = setup_ebook_switch(pdev);
+ if (r)
+ goto err_ebook;
+
+ r = setup_lid_switch(pdev);
+ if (r)
+ goto err_lid;
+
+ r = setup_lid_events();
+ if (r)
+ goto err_lidevt;
+
+ r = setup_ec_sci();
+ if (r)
+ goto err_ecsci;
+
+ /* Enable PME generation for EC-generated events */
+ outl(CS5536_GPIOM6_PME_EN | CS5536_GPIOM7_PME_EN,
+ acpi_base + CS5536_PM_GPE0_EN);
+
+ /* Clear pending events */
+ outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
+ process_sci_queue(false);
+
+ /* Initial sync */
+ send_ebook_state();
+ detect_lid_state();
+ send_lid_state();
+
+ r = setup_sci_interrupt(pdev);
+ if (r)
+ goto err_sci;
+
+ /* Enable all EC events */
+ olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+ return r;
+
+err_sci:
+ free_ec_sci();
+err_ecsci:
+ free_lid_events();
+err_lidevt:
+ free_lid_switch();
+err_lid:
+ free_ebook_switch();
+err_ebook:
+ free_power_button();
+ return r;
+}
+
+static int __devexit xo1_sci_remove(struct platform_device *pdev)
+{
+ mfd_cell_disable(pdev);
+ free_irq(sci_irq, pdev);
+ cancel_work_sync(&sci_work);
+ free_ec_sci();
+ free_lid_events();
+ free_lid_switch();
+ free_ebook_switch();
+ free_power_button();
+ acpi_base = 0;
+ return 0;
+}
+
+static struct platform_driver xo1_sci_driver = {
+ .driver = {
+ .name = "olpc-xo1-sci-acpi",
+ },
+ .probe = xo1_sci_probe,
+ .remove = __devexit_p(xo1_sci_remove),
+ .suspend = xo1_sci_suspend,
+ .resume = xo1_sci_resume,
+};
+
+static int __init xo1_sci_init(void)
+{
+ return platform_driver_register(&xo1_sci_driver);
+}
+arch_initcall(xo1_sci_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
deleted file mode 100644
index ab81fb27176..00000000000
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Support for features of the OLPC XO-1 laptop
- *
- * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
- * Copyright (C) 2010 One Laptop per Child
- * Copyright (C) 2006 Red Hat, Inc.
- * Copyright (C) 2006 Advanced Micro Devices, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/pm.h>
-#include <linux/mfd/core.h>
-
-#include <asm/io.h>
-#include <asm/olpc.h>
-
-#define DRV_NAME "olpc-xo1"
-
-/* PMC registers (PMS block) */
-#define PM_SCLK 0x10
-#define PM_IN_SLPCTL 0x20
-#define PM_WKXD 0x34
-#define PM_WKD 0x30
-#define PM_SSC 0x54
-
-/* PM registers (ACPI block) */
-#define PM1_CNT 0x08
-#define PM_GPE0_STS 0x18
-
-static unsigned long acpi_base;
-static unsigned long pms_base;
-
-static void xo1_power_off(void)
-{
- printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
-
- /* Enable all of these controls with 0 delay */
- outl(0x40000000, pms_base + PM_SCLK);
- outl(0x40000000, pms_base + PM_IN_SLPCTL);
- outl(0x40000000, pms_base + PM_WKXD);
- outl(0x40000000, pms_base + PM_WKD);
-
- /* Clear status bits (possibly unnecessary) */
- outl(0x0002ffff, pms_base + PM_SSC);
- outl(0xffffffff, acpi_base + PM_GPE0_STS);
-
- /* Write SLP_EN bit to start the machinery */
- outl(0x00002000, acpi_base + PM1_CNT);
-}
-
-static int __devinit olpc_xo1_probe(struct platform_device *pdev)
-{
- struct resource *res;
- int err;
-
- /* don't run on non-XOs */
- if (!machine_is_olpc())
- return -ENODEV;
-
- err = mfd_cell_enable(pdev);
- if (err)
- return err;
-
- res = platform_get_resource(pdev, IORESOURCE_IO, 0);
- if (!res) {
- dev_err(&pdev->dev, "can't fetch device resource info\n");
- return -EIO;
- }
- if (strcmp(pdev->name, "cs5535-pms") == 0)
- pms_base = res->start;
- else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
- acpi_base = res->start;
-
- /* If we have both addresses, we can override the poweroff hook */
- if (pms_base && acpi_base) {
- pm_power_off = xo1_power_off;
- printk(KERN_INFO "OLPC XO-1 support registered\n");
- }
-
- return 0;
-}
-
-static int __devexit olpc_xo1_remove(struct platform_device *pdev)
-{
- mfd_cell_disable(pdev);
-
- if (strcmp(pdev->name, "cs5535-pms") == 0)
- pms_base = 0;
- else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
- acpi_base = 0;
-
- pm_power_off = NULL;
- return 0;
-}
-
-static struct platform_driver cs5535_pms_drv = {
- .driver = {
- .name = "cs5535-pms",
- .owner = THIS_MODULE,
- },
- .probe = olpc_xo1_probe,
- .remove = __devexit_p(olpc_xo1_remove),
-};
-
-static struct platform_driver cs5535_acpi_drv = {
- .driver = {
- .name = "olpc-xo1-pm-acpi",
- .owner = THIS_MODULE,
- },
- .probe = olpc_xo1_probe,
- .remove = __devexit_p(olpc_xo1_remove),
-};
-
-static int __init olpc_xo1_init(void)
-{
- int r;
-
- r = platform_driver_register(&cs5535_pms_drv);
- if (r)
- return r;
-
- r = platform_driver_register(&cs5535_acpi_drv);
- if (r)
- platform_driver_unregister(&cs5535_pms_drv);
-
- return r;
-}
-
-static void __exit olpc_xo1_exit(void)
-{
- platform_driver_unregister(&cs5535_acpi_drv);
- platform_driver_unregister(&cs5535_pms_drv);
-}
-
-MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:cs5535-pms");
-
-module_init(olpc_xo1_init);
-module_exit(olpc_xo1_exit);
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
new file mode 100644
index 00000000000..2b235b77d9a
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -0,0 +1,168 @@
+/*
+ * Support for OLPC XO-1.5 System Control Interrupts (SCI)
+ *
+ * Copyright (C) 2009-2010 One Laptop per Child
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/power_supply.h>
+
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo15-sci"
+#define PFX DRV_NAME ": "
+#define XO15_SCI_CLASS DRV_NAME
+#define XO15_SCI_DEVICE_NAME "OLPC XO-1.5 SCI"
+
+static unsigned long xo15_sci_gpe;
+
+static void battery_status_changed(void)
+{
+ struct power_supply *psy = power_supply_get_by_name("olpc-battery");
+
+ if (psy) {
+ power_supply_changed(psy);
+ put_device(psy->dev);
+ }
+}
+
+static void ac_status_changed(void)
+{
+ struct power_supply *psy = power_supply_get_by_name("olpc-ac");
+
+ if (psy) {
+ power_supply_changed(psy);
+ put_device(psy->dev);
+ }
+}
+
+static void process_sci_queue(void)
+{
+ u16 data;
+ int r;
+
+ do {
+ r = olpc_ec_sci_query(&data);
+ if (r || !data)
+ break;
+
+ pr_debug(PFX "SCI 0x%x received\n", data);
+
+ switch (data) {
+ case EC_SCI_SRC_BATERR:
+ case EC_SCI_SRC_BATSOC:
+ case EC_SCI_SRC_BATTERY:
+ case EC_SCI_SRC_BATCRIT:
+ battery_status_changed();
+ break;
+ case EC_SCI_SRC_ACPWR:
+ ac_status_changed();
+ break;
+ }
+ } while (data);
+
+ if (r)
+ pr_err(PFX "Failed to clear SCI queue");
+}
+
+static void process_sci_queue_work(struct work_struct *work)
+{
+ process_sci_queue();
+}
+
+static DECLARE_WORK(sci_work, process_sci_queue_work);
+
+static u32 xo15_sci_gpe_handler(acpi_handle gpe_device, u32 gpe, void *context)
+{
+ schedule_work(&sci_work);
+ return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE;
+}
+
+static int xo15_sci_add(struct acpi_device *device)
+{
+ unsigned long long tmp;
+ acpi_status status;
+
+ if (!device)
+ return -EINVAL;
+
+ strcpy(acpi_device_name(device), XO15_SCI_DEVICE_NAME);
+ strcpy(acpi_device_class(device), XO15_SCI_CLASS);
+
+ /* Get GPE bit assignment (EC events). */
+ status = acpi_evaluate_integer(device->handle, "_GPE", NULL, &tmp);
+ if (ACPI_FAILURE(status))
+ return -EINVAL;
+
+ xo15_sci_gpe = tmp;
+ status = acpi_install_gpe_handler(NULL, xo15_sci_gpe,
+ ACPI_GPE_EDGE_TRIGGERED,
+ xo15_sci_gpe_handler, device);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ dev_info(&device->dev, "Initialized, GPE = 0x%lx\n", xo15_sci_gpe);
+
+ /* Flush queue, and enable all SCI events */
+ process_sci_queue();
+ olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+ acpi_enable_gpe(NULL, xo15_sci_gpe);
+
+ /* Enable wake-on-EC */
+ if (device->wakeup.flags.valid)
+ device_init_wakeup(&device->dev, true);
+
+ return 0;
+}
+
+static int xo15_sci_remove(struct acpi_device *device, int type)
+{
+ acpi_disable_gpe(NULL, xo15_sci_gpe);
+ acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
+ cancel_work_sync(&sci_work);
+ return 0;
+}
+
+static int xo15_sci_resume(struct acpi_device *device)
+{
+ /* Enable all EC events */
+ olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+ /* Power/battery status might have changed */
+ battery_status_changed();
+ ac_status_changed();
+
+ return 0;
+}
+
+static const struct acpi_device_id xo15_sci_device_ids[] = {
+ {"XO15EC", 0},
+ {"", 0},
+};
+
+static struct acpi_driver xo15_sci_drv = {
+ .name = DRV_NAME,
+ .class = XO15_SCI_CLASS,
+ .ids = xo15_sci_device_ids,
+ .ops = {
+ .add = xo15_sci_add,
+ .remove = xo15_sci_remove,
+ .resume = xo15_sci_resume,
+ },
+};
+
+static int __init xo15_sci_init(void)
+{
+ return acpi_bus_register_driver(&xo15_sci_drv);
+}
+device_initcall(xo15_sci_init);
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index edaf3fe8dc5..8b9940e78e2 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -18,6 +18,8 @@
#include <linux/io.h>
#include <linux/string.h>
#include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/syscore_ops.h>
#include <asm/geode.h>
#include <asm/setup.h>
@@ -29,6 +31,9 @@ EXPORT_SYMBOL_GPL(olpc_platform_info);
static DEFINE_SPINLOCK(ec_lock);
+/* EC event mask to be applied during suspend (defining wakeup sources). */
+static u16 ec_wakeup_mask;
+
/* what the timeout *should* be (in ms) */
#define EC_BASE_TIMEOUT 20
@@ -187,41 +192,125 @@ err:
}
EXPORT_SYMBOL_GPL(olpc_ec_cmd);
-static bool __init check_ofw_architecture(void)
+void olpc_ec_wakeup_set(u16 value)
+{
+ ec_wakeup_mask |= value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set);
+
+void olpc_ec_wakeup_clear(u16 value)
{
- size_t propsize;
- char olpc_arch[5];
- const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
- void *res[] = { &propsize };
+ ec_wakeup_mask &= ~value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear);
- if (olpc_ofw("getprop", args, res)) {
- printk(KERN_ERR "ofw: getprop call failed!\n");
+/*
+ * Returns true if the compile and runtime configurations allow for EC events
+ * to wake the system.
+ */
+bool olpc_ec_wakeup_available(void)
+{
+ if (!machine_is_olpc())
return false;
+
+ /*
+ * XO-1 EC wakeups are available when olpc-xo1-sci driver is
+ * compiled in
+ */
+#ifdef CONFIG_OLPC_XO1_SCI
+ if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */
+ return true;
+#endif
+
+ /*
+ * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
+ * compiled in
+ */
+#ifdef CONFIG_OLPC_XO15_SCI
+ if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */
+ return true;
+#endif
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available);
+
+int olpc_ec_mask_write(u16 bits)
+{
+ if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
+ __be16 ec_word = cpu_to_be16(bits);
+ return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2,
+ NULL, 0);
+ } else {
+ unsigned char ec_byte = bits & 0xff;
+ return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0);
}
- return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
}
+EXPORT_SYMBOL_GPL(olpc_ec_mask_write);
-static u32 __init get_board_revision(void)
+int olpc_ec_sci_query(u16 *sci_value)
{
- size_t propsize;
- __be32 rev;
- const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
- void *res[] = { &propsize };
-
- if (olpc_ofw("getprop", args, res) || propsize != 4) {
- printk(KERN_ERR "ofw: getprop call failed!\n");
- return cpu_to_be32(0);
+ int ret;
+
+ if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
+ __be16 ec_word;
+ ret = olpc_ec_cmd(EC_EXT_SCI_QUERY,
+ NULL, 0, (void *) &ec_word, 2);
+ if (ret == 0)
+ *sci_value = be16_to_cpu(ec_word);
+ } else {
+ unsigned char ec_byte;
+ ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1);
+ if (ret == 0)
+ *sci_value = ec_byte;
}
- return be32_to_cpu(rev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
+
+static int olpc_ec_suspend(void)
+{
+ return olpc_ec_mask_write(ec_wakeup_mask);
+}
+
+static bool __init check_ofw_architecture(struct device_node *root)
+{
+ const char *olpc_arch;
+ int propsize;
+
+ olpc_arch = of_get_property(root, "architecture", &propsize);
+ return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
+}
+
+static u32 __init get_board_revision(struct device_node *root)
+{
+ int propsize;
+ const __be32 *rev;
+
+ rev = of_get_property(root, "board-revision-int", &propsize);
+ if (propsize != 4)
+ return 0;
+
+ return be32_to_cpu(*rev);
}
static bool __init platform_detect(void)
{
- if (!check_ofw_architecture())
+ struct device_node *root = of_find_node_by_path("/");
+ bool success;
+
+ if (!root)
return false;
- olpc_platform_info.flags |= OLPC_F_PRESENT;
- olpc_platform_info.boardrev = get_board_revision();
- return true;
+
+ success = check_ofw_architecture(root);
+ if (success) {
+ olpc_platform_info.boardrev = get_board_revision(root);
+ olpc_platform_info.flags |= OLPC_F_PRESENT;
+ }
+
+ of_node_put(root);
+ return success;
}
static int __init add_xo1_platform_devices(void)
@@ -239,6 +328,10 @@ static int __init add_xo1_platform_devices(void)
return 0;
}
+static struct syscore_ops olpc_syscore_ops = {
+ .suspend = olpc_ec_suspend,
+};
+
static int __init olpc_init(void)
{
int r = 0;
@@ -263,6 +356,9 @@ static int __init olpc_init(void)
!cs5535_has_vsa2())
x86_init.pci.arch_init = pci_olpc_init;
#endif
+ /* EC version 0x5f adds support for wide SCI mask */
+ if (olpc_platform_info.ecver >= 0x5f)
+ olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI;
printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
@@ -275,6 +371,8 @@ static int __init olpc_init(void)
return r;
}
+ register_syscore_ops(&olpc_syscore_ops);
+
return 0;
}
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index 044bda5b317..d6ee9298692 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -19,7 +19,9 @@
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/of.h>
+#include <linux/of_platform.h>
#include <linux/of_pdt.h>
+#include <asm/olpc.h>
#include <asm/olpc_ofw.h>
static phandle __init olpc_dt_getsibling(phandle node)
@@ -163,6 +165,107 @@ static struct of_pdt_ops prom_olpc_ops __initdata = {
.pkg2path = olpc_dt_pkg2path,
};
+static phandle __init olpc_dt_finddevice(const char *path)
+{
+ phandle node;
+ const void *args[] = { path };
+ void *res[] = { &node };
+
+ if (olpc_ofw("finddevice", args, res)) {
+ pr_err("olpc_dt: finddevice failed!\n");
+ return 0;
+ }
+
+ if ((s32) node == -1)
+ return 0;
+
+ return node;
+}
+
+static int __init olpc_dt_interpret(const char *words)
+{
+ int result;
+ const void *args[] = { words };
+ void *res[] = { &result };
+
+ if (olpc_ofw("interpret", args, res)) {
+ pr_err("olpc_dt: interpret failed!\n");
+ return -1;
+ }
+
+ return result;
+}
+
+/*
+ * Extract board revision directly from OFW device tree.
+ * We can't use olpc_platform_info because that hasn't been set up yet.
+ */
+static u32 __init olpc_dt_get_board_revision(void)
+{
+ phandle node;
+ __be32 rev;
+ int r;
+
+ node = olpc_dt_finddevice("/");
+ if (!node)
+ return 0;
+
+ r = olpc_dt_getproperty(node, "board-revision-int",
+ (char *) &rev, sizeof(rev));
+ if (r < 0)
+ return 0;
+
+ return be32_to_cpu(rev);
+}
+
+void __init olpc_dt_fixup(void)
+{
+ int r;
+ char buf[64];
+ phandle node;
+ u32 board_rev;
+
+ node = olpc_dt_finddevice("/battery@0");
+ if (!node)
+ return;
+
+ /*
+ * If the battery node has a compatible property, we are running a new
+ * enough firmware and don't have fixups to make.
+ */
+ r = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf));
+ if (r > 0)
+ return;
+
+ pr_info("PROM DT: Old firmware detected, applying fixes\n");
+
+ /* Add olpc,xo1-battery compatible marker to battery node */
+ olpc_dt_interpret("\" /battery@0\" find-device"
+ " \" olpc,xo1-battery\" +compatible"
+ " device-end");
+
+ board_rev = olpc_dt_get_board_revision();
+ if (!board_rev)
+ return;
+
+ if (board_rev >= olpc_board_pre(0xd0)) {
+ /* XO-1.5: add dcon device */
+ olpc_dt_interpret("\" /pci/display@1\" find-device"
+ " new-device"
+ " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible"
+ " finish-device device-end");
+ } else {
+ /* XO-1: add dcon device, mark RTC as olpc,xo1-rtc */
+ olpc_dt_interpret("\" /pci/display@1,1\" find-device"
+ " new-device"
+ " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible"
+ " finish-device device-end"
+ " \" /rtc\" find-device"
+ " \" olpc,xo1-rtc\" +compatible"
+ " device-end");
+ }
+}
+
void __init olpc_dt_build_devicetree(void)
{
phandle root;
@@ -170,6 +273,8 @@ void __init olpc_dt_build_devicetree(void)
if (!olpc_ofw_is_installed())
return;
+ olpc_dt_fixup();
+
root = olpc_dt_getsibling(0);
if (!root) {
pr_err("PROM: unable to get root node from OFW!\n");
@@ -180,3 +285,20 @@ void __init olpc_dt_build_devicetree(void)
pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
prom_early_allocated);
}
+
+/* A list of DT node/bus matches that we want to expose as platform devices */
+static struct of_device_id __initdata of_ids[] = {
+ { .compatible = "olpc,xo1-battery" },
+ { .compatible = "olpc,xo1-dcon" },
+ { .compatible = "olpc,xo1-rtc" },
+ {},
+};
+
+static int __init olpc_create_platform_devices(void)
+{
+ if (machine_is_olpc())
+ return of_platform_bus_probe(NULL, of_ids, NULL);
+ else
+ return 0;
+}
+device_initcall(olpc_create_platform_devices);
diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S
new file mode 100644
index 00000000000..948deb28975
--- /dev/null
+++ b/arch/x86/platform/olpc/xo1-wakeup.S
@@ -0,0 +1,124 @@
+.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/pgtable_32.h>
+
+ .macro writepost,value
+ movb $0x34, %al
+ outb %al, $0x70
+ movb $\value, %al
+ outb %al, $0x71
+ .endm
+
+wakeup_start:
+ # OFW lands us here, running in protected mode, with a
+ # kernel-compatible GDT already setup.
+
+ # Clear any dangerous flags
+ pushl $0
+ popfl
+
+ writepost 0x31
+
+ # Set up %cr3
+ movl $initial_page_table - __PAGE_OFFSET, %eax
+ movl %eax, %cr3
+
+ movl saved_cr4, %eax
+ movl %eax, %cr4
+
+ movl saved_cr0, %eax
+ movl %eax, %cr0
+
+ # Control registers were modified, pipeline resync is needed
+ jmp 1f
+1:
+
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+
+ lgdt saved_gdt
+ lidt saved_idt
+ lldt saved_ldt
+ ljmp $(__KERNEL_CS),$1f
+1:
+ movl %cr3, %eax
+ movl %eax, %cr3
+ wbinvd
+
+ # Go back to the return point
+ jmp ret_point
+
+save_registers:
+ sgdt saved_gdt
+ sidt saved_idt
+ sldt saved_ldt
+
+ pushl %edx
+ movl %cr4, %edx
+ movl %edx, saved_cr4
+
+ movl %cr0, %edx
+ movl %edx, saved_cr0
+
+ popl %edx
+
+ movl %ebx, saved_context_ebx
+ movl %ebp, saved_context_ebp
+ movl %esi, saved_context_esi
+ movl %edi, saved_context_edi
+
+ pushfl
+ popl saved_context_eflags
+
+ ret
+
+restore_registers:
+ movl saved_context_ebp, %ebp
+ movl saved_context_ebx, %ebx
+ movl saved_context_esi, %esi
+ movl saved_context_edi, %edi
+
+ pushl saved_context_eflags
+ popfl
+
+ ret
+
+ENTRY(do_olpc_suspend_lowlevel)
+ call save_processor_state
+ call save_registers
+
+ # This is the stack context we want to remember
+ movl %esp, saved_context_esp
+
+ pushl $3
+ call xo1_do_sleep
+
+ jmp wakeup_start
+ .p2align 4,,7
+ret_point:
+ movl saved_context_esp, %esp
+
+ writepost 0x32
+
+ call restore_registers
+ call restore_processor_state
+ ret
+
+.data
+saved_gdt: .long 0,0
+saved_idt: .long 0,0
+saved_ldt: .long 0
+saved_cr4: .long 0
+saved_cr0: .long 0
+saved_context_esp: .long 0
+saved_context_edi: .long 0
+saved_context_esi: .long 0
+saved_context_ebx: .long 0
+saved_context_ebp: .long 0
+saved_context_eflags: .long 0
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 7cb6424317f..db8b915f54b 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
/*
* SGI UltraViolet TLB flush routines.
*
- * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
+ * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI.
*
* This code is released under the GNU General Public License version 2 or
* later.
@@ -35,6 +35,7 @@ static int timeout_base_ns[] = {
5242880,
167772160
};
+
static int timeout_us;
static int nobau;
static int baudisabled;
@@ -42,20 +43,70 @@ static spinlock_t disable_lock;
static cycles_t congested_cycles;
/* tunables: */
-static int max_bau_concurrent = MAX_BAU_CONCURRENT;
-static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
-static int plugged_delay = PLUGGED_DELAY;
-static int plugsb4reset = PLUGSB4RESET;
-static int timeoutsb4reset = TIMEOUTSB4RESET;
-static int ipi_reset_limit = IPI_RESET_LIMIT;
-static int complete_threshold = COMPLETE_THRESHOLD;
-static int congested_response_us = CONGESTED_RESPONSE_US;
-static int congested_reps = CONGESTED_REPS;
-static int congested_period = CONGESTED_PERIOD;
+static int max_concurr = MAX_BAU_CONCURRENT;
+static int max_concurr_const = MAX_BAU_CONCURRENT;
+static int plugged_delay = PLUGGED_DELAY;
+static int plugsb4reset = PLUGSB4RESET;
+static int timeoutsb4reset = TIMEOUTSB4RESET;
+static int ipi_reset_limit = IPI_RESET_LIMIT;
+static int complete_threshold = COMPLETE_THRESHOLD;
+static int congested_respns_us = CONGESTED_RESPONSE_US;
+static int congested_reps = CONGESTED_REPS;
+static int congested_period = CONGESTED_PERIOD;
+
+static struct tunables tunables[] = {
+ {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
+ {&plugged_delay, PLUGGED_DELAY},
+ {&plugsb4reset, PLUGSB4RESET},
+ {&timeoutsb4reset, TIMEOUTSB4RESET},
+ {&ipi_reset_limit, IPI_RESET_LIMIT},
+ {&complete_threshold, COMPLETE_THRESHOLD},
+ {&congested_respns_us, CONGESTED_RESPONSE_US},
+ {&congested_reps, CONGESTED_REPS},
+ {&congested_period, CONGESTED_PERIOD}
+};
+
static struct dentry *tunables_dir;
static struct dentry *tunables_file;
-static int __init setup_nobau(char *arg)
+/* these correspond to the statistics printed by ptc_seq_show() */
+static char *stat_description[] = {
+ "sent: number of shootdown messages sent",
+ "stime: time spent sending messages",
+ "numuvhubs: number of hubs targeted with shootdown",
+ "numuvhubs16: number times 16 or more hubs targeted",
+ "numuvhubs8: number times 8 or more hubs targeted",
+ "numuvhubs4: number times 4 or more hubs targeted",
+ "numuvhubs2: number times 2 or more hubs targeted",
+ "numuvhubs1: number times 1 hub targeted",
+ "numcpus: number of cpus targeted with shootdown",
+ "dto: number of destination timeouts",
+ "retries: destination timeout retries sent",
+ "rok: : destination timeouts successfully retried",
+ "resetp: ipi-style resource resets for plugs",
+ "resett: ipi-style resource resets for timeouts",
+ "giveup: fall-backs to ipi-style shootdowns",
+ "sto: number of source timeouts",
+ "bz: number of stay-busy's",
+ "throt: number times spun in throttle",
+ "swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
+ "recv: shootdown messages received",
+ "rtime: time spent processing messages",
+ "all: shootdown all-tlb messages",
+ "one: shootdown one-tlb messages",
+ "mult: interrupts that found multiple messages",
+ "none: interrupts that found no messages",
+ "retry: number of retry messages processed",
+ "canc: number messages canceled by retries",
+ "nocan: number retries that found nothing to cancel",
+ "reset: number of ipi-style reset requests processed",
+ "rcan: number messages canceled by reset requests",
+ "disable: number times use of the BAU was disabled",
+ "enable: number times use of the BAU was re-enabled"
+};
+
+static int __init
+setup_nobau(char *arg)
{
nobau = 1;
return 0;
@@ -63,7 +114,7 @@ static int __init setup_nobau(char *arg)
early_param("nobau", setup_nobau);
/* base pnode in this partition */
-static int uv_partition_base_pnode __read_mostly;
+static int uv_base_pnode __read_mostly;
/* position of pnode (which is nasid>>1): */
static int uv_nshift __read_mostly;
static unsigned long uv_mmask __read_mostly;
@@ -109,60 +160,52 @@ static int __init uvhub_to_first_apicid(int uvhub)
* clear of the Timeout bit (as well) will free the resource. No reply will
* be sent (the hardware will only do one reply per message).
*/
-static inline void uv_reply_to_message(struct msg_desc *mdp,
- struct bau_control *bcp)
+static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
{
unsigned long dw;
- struct bau_payload_queue_entry *msg;
+ struct bau_pq_entry *msg;
msg = mdp->msg;
if (!msg->canceled) {
- dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
- msg->sw_ack_vector;
- uv_write_local_mmr(
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+ dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
+ write_mmr_sw_ack(dw);
}
msg->replied_to = 1;
- msg->sw_ack_vector = 0;
+ msg->swack_vec = 0;
}
/*
* Process the receipt of a RETRY message
*/
-static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
- struct bau_control *bcp)
+static void bau_process_retry_msg(struct msg_desc *mdp,
+ struct bau_control *bcp)
{
int i;
int cancel_count = 0;
- int slot2;
unsigned long msg_res;
unsigned long mmr = 0;
- struct bau_payload_queue_entry *msg;
- struct bau_payload_queue_entry *msg2;
- struct ptc_stats *stat;
+ struct bau_pq_entry *msg = mdp->msg;
+ struct bau_pq_entry *msg2;
+ struct ptc_stats *stat = bcp->statp;
- msg = mdp->msg;
- stat = bcp->statp;
stat->d_retries++;
/*
* cancel any message from msg+1 to the retry itself
*/
for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
- if (msg2 > mdp->va_queue_last)
- msg2 = mdp->va_queue_first;
+ if (msg2 > mdp->queue_last)
+ msg2 = mdp->queue_first;
if (msg2 == msg)
break;
- /* same conditions for cancellation as uv_do_reset */
+ /* same conditions for cancellation as do_reset */
if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
- (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
- msg->sw_ack_vector) == 0) &&
+ (msg2->swack_vec) && ((msg2->swack_vec &
+ msg->swack_vec) == 0) &&
(msg2->sending_cpu == msg->sending_cpu) &&
(msg2->msg_type != MSG_NOOP)) {
- slot2 = msg2 - mdp->va_queue_first;
- mmr = uv_read_local_mmr
- (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = msg2->sw_ack_vector;
+ mmr = read_mmr_sw_ack();
+ msg_res = msg2->swack_vec;
/*
* This is a message retry; clear the resources held
* by the previous message only if they timed out.
@@ -170,6 +213,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
* situation to report.
*/
if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
+ unsigned long mr;
/*
* is the resource timed out?
* make everyone ignore the cancelled message.
@@ -177,10 +221,8 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
msg2->canceled = 1;
stat->d_canceled++;
cancel_count++;
- uv_write_local_mmr(
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- (msg_res << UV_SW_ACK_NPENDING) |
- msg_res);
+ mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
+ write_mmr_sw_ack(mr);
}
}
}
@@ -192,20 +234,19 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
* Do all the things a cpu should do for a TLB shootdown message.
* Other cpu's may come here at the same time for this message.
*/
-static void uv_bau_process_message(struct msg_desc *mdp,
- struct bau_control *bcp)
+static void bau_process_message(struct msg_desc *mdp,
+ struct bau_control *bcp)
{
- int msg_ack_count;
short socket_ack_count = 0;
- struct ptc_stats *stat;
- struct bau_payload_queue_entry *msg;
+ short *sp;
+ struct atomic_short *asp;
+ struct ptc_stats *stat = bcp->statp;
+ struct bau_pq_entry *msg = mdp->msg;
struct bau_control *smaster = bcp->socket_master;
/*
* This must be a normal message, or retry of a normal message
*/
- msg = mdp->msg;
- stat = bcp->statp;
if (msg->address == TLB_FLUSH_ALL) {
local_flush_tlb();
stat->d_alltlb++;
@@ -222,30 +263,32 @@ static void uv_bau_process_message(struct msg_desc *mdp,
* cpu number.
*/
if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
- uv_bau_process_retry_msg(mdp, bcp);
+ bau_process_retry_msg(mdp, bcp);
/*
- * This is a sw_ack message, so we have to reply to it.
+ * This is a swack message, so we have to reply to it.
* Count each responding cpu on the socket. This avoids
* pinging the count's cache line back and forth between
* the sockets.
*/
- socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
- &smaster->socket_acknowledge_count[mdp->msg_slot]);
+ sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
+ asp = (struct atomic_short *)sp;
+ socket_ack_count = atom_asr(1, asp);
if (socket_ack_count == bcp->cpus_in_socket) {
+ int msg_ack_count;
/*
* Both sockets dump their completed count total into
* the message's count.
*/
smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
- msg_ack_count = atomic_add_short_return(socket_ack_count,
- (struct atomic_short *)&msg->acknowledge_count);
+ asp = (struct atomic_short *)&msg->acknowledge_count;
+ msg_ack_count = atom_asr(socket_ack_count, asp);
if (msg_ack_count == bcp->cpus_in_uvhub) {
/*
* All cpus in uvhub saw it; reply
*/
- uv_reply_to_message(mdp, bcp);
+ reply_to_message(mdp, bcp);
}
}
@@ -253,14 +296,18 @@ static void uv_bau_process_message(struct msg_desc *mdp,
}
/*
- * Determine the first cpu on a uvhub.
+ * Determine the first cpu on a pnode.
*/
-static int uvhub_to_first_cpu(int uvhub)
+static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)
{
int cpu;
- for_each_present_cpu(cpu)
- if (uvhub == uv_cpu_to_blade_id(cpu))
+ struct hub_and_pnode *hpp;
+
+ for_each_present_cpu(cpu) {
+ hpp = &smaster->thp[cpu];
+ if (pnode == hpp->pnode)
return cpu;
+ }
return -1;
}
@@ -268,62 +315,51 @@ static int uvhub_to_first_cpu(int uvhub)
* Last resort when we get a large number of destination timeouts is
* to clear resources held by a given cpu.
* Do this with IPI so that all messages in the BAU message queue
- * can be identified by their nonzero sw_ack_vector field.
+ * can be identified by their nonzero swack_vec field.
*
* This is entered for a single cpu on the uvhub.
* The sender want's this uvhub to free a specific message's
- * sw_ack resources.
+ * swack resources.
*/
-static void
-uv_do_reset(void *ptr)
+static void do_reset(void *ptr)
{
int i;
- int slot;
- int count = 0;
- unsigned long mmr;
- unsigned long msg_res;
- struct bau_control *bcp;
- struct reset_args *rap;
- struct bau_payload_queue_entry *msg;
- struct ptc_stats *stat;
+ struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
+ struct reset_args *rap = (struct reset_args *)ptr;
+ struct bau_pq_entry *msg;
+ struct ptc_stats *stat = bcp->statp;
- bcp = &per_cpu(bau_control, smp_processor_id());
- rap = (struct reset_args *)ptr;
- stat = bcp->statp;
stat->d_resets++;
-
/*
* We're looking for the given sender, and
- * will free its sw_ack resource.
+ * will free its swack resource.
* If all cpu's finally responded after the timeout, its
* message 'replied_to' was set.
*/
- for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
- /* uv_do_reset: same conditions for cancellation as
- uv_bau_process_retry_msg() */
+ for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+ unsigned long msg_res;
+ /* do_reset: same conditions for cancellation as
+ bau_process_retry_msg() */
if ((msg->replied_to == 0) &&
(msg->canceled == 0) &&
(msg->sending_cpu == rap->sender) &&
- (msg->sw_ack_vector) &&
+ (msg->swack_vec) &&
(msg->msg_type != MSG_NOOP)) {
+ unsigned long mmr;
+ unsigned long mr;
/*
* make everyone else ignore this message
*/
msg->canceled = 1;
- slot = msg - bcp->va_queue_first;
- count++;
/*
* only reset the resource if it is still pending
*/
- mmr = uv_read_local_mmr
- (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = msg->sw_ack_vector;
+ mmr = read_mmr_sw_ack();
+ msg_res = msg->swack_vec;
+ mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
if (mmr & msg_res) {
stat->d_rcanceled++;
- uv_write_local_mmr(
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- (msg_res << UV_SW_ACK_NPENDING) |
- msg_res);
+ write_mmr_sw_ack(mr);
}
}
}
@@ -334,39 +370,42 @@ uv_do_reset(void *ptr)
* Use IPI to get all target uvhubs to release resources held by
* a given sending cpu number.
*/
-static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
- int sender)
+static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
{
- int uvhub;
- int cpu;
- cpumask_t mask;
+ int pnode;
+ int apnode;
+ int maskbits;
+ int sender = bcp->cpu;
+ cpumask_t *mask = bcp->uvhub_master->cpumask;
+ struct bau_control *smaster = bcp->socket_master;
struct reset_args reset_args;
reset_args.sender = sender;
-
- cpus_clear(mask);
+ cpus_clear(*mask);
/* find a single cpu for each uvhub in this distribution mask */
- for (uvhub = 0;
- uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
- uvhub++) {
- if (!bau_uvhub_isset(uvhub, distribution))
+ maskbits = sizeof(struct pnmask) * BITSPERBYTE;
+ /* each bit is a pnode relative to the partition base pnode */
+ for (pnode = 0; pnode < maskbits; pnode++) {
+ int cpu;
+ if (!bau_uvhub_isset(pnode, distribution))
continue;
- /* find a cpu for this uvhub */
- cpu = uvhub_to_first_cpu(uvhub);
- cpu_set(cpu, mask);
+ apnode = pnode + bcp->partition_base_pnode;
+ cpu = pnode_to_first_cpu(apnode, smaster);
+ cpu_set(cpu, *mask);
}
- /* IPI all cpus; Preemption is already disabled */
- smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+
+ /* IPI all cpus; preemption is already disabled */
+ smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);
return;
}
-static inline unsigned long
-cycles_2_us(unsigned long long cyc)
+static inline unsigned long cycles_2_us(unsigned long long cyc)
{
unsigned long long ns;
unsigned long us;
- ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
- >> CYC2NS_SCALE_FACTOR;
+ int cpu = smp_processor_id();
+
+ ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
us = ns / 1000;
return us;
}
@@ -376,56 +415,56 @@ cycles_2_us(unsigned long long cyc)
* leaves uvhub_quiesce set so that no new broadcasts are started by
* bau_flush_send_and_wait()
*/
-static inline void
-quiesce_local_uvhub(struct bau_control *hmaster)
+static inline void quiesce_local_uvhub(struct bau_control *hmaster)
{
- atomic_add_short_return(1, (struct atomic_short *)
- &hmaster->uvhub_quiesce);
+ atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
}
/*
* mark this quiet-requestor as done
*/
-static inline void
-end_uvhub_quiesce(struct bau_control *hmaster)
+static inline void end_uvhub_quiesce(struct bau_control *hmaster)
{
- atomic_add_short_return(-1, (struct atomic_short *)
- &hmaster->uvhub_quiesce);
+ atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
+}
+
+static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
+{
+ unsigned long descriptor_status;
+
+ descriptor_status = uv_read_local_mmr(mmr_offset);
+ descriptor_status >>= right_shift;
+ descriptor_status &= UV_ACT_STATUS_MASK;
+ return descriptor_status;
}
/*
* Wait for completion of a broadcast software ack message
* return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
*/
-static int uv_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift, int this_cpu,
- struct bau_control *bcp, struct bau_control *smaster, long try)
+static int uv1_wait_completion(struct bau_desc *bau_desc,
+ unsigned long mmr_offset, int right_shift,
+ struct bau_control *bcp, long try)
{
unsigned long descriptor_status;
- cycles_t ttime;
+ cycles_t ttm;
struct ptc_stats *stat = bcp->statp;
- struct bau_control *hmaster;
-
- hmaster = bcp->uvhub_master;
+ descriptor_status = uv1_read_status(mmr_offset, right_shift);
/* spin on the status MMR, waiting for it to go idle */
- while ((descriptor_status = (((unsigned long)
- uv_read_local_mmr(mmr_offset) >>
- right_shift) & UV_ACT_STATUS_MASK)) !=
- DESC_STATUS_IDLE) {
+ while ((descriptor_status != DS_IDLE)) {
/*
- * Our software ack messages may be blocked because there are
- * no swack resources available. As long as none of them
- * has timed out hardware will NACK our message and its
- * state will stay IDLE.
+ * Our software ack messages may be blocked because
+ * there are no swack resources available. As long
+ * as none of them has timed out hardware will NACK
+ * our message and its state will stay IDLE.
*/
- if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+ if (descriptor_status == DS_SOURCE_TIMEOUT) {
stat->s_stimeout++;
return FLUSH_GIVEUP;
- } else if (descriptor_status ==
- DESC_STATUS_DESTINATION_TIMEOUT) {
+ } else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
stat->s_dtimeout++;
- ttime = get_cycles();
+ ttm = get_cycles();
/*
* Our retries may be blocked by all destination
@@ -433,8 +472,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
* pending. In that case hardware returns the
* ERROR that looks like a destination timeout.
*/
- if (cycles_2_us(ttime - bcp->send_message) <
- timeout_us) {
+ if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
bcp->conseccompletes = 0;
return FLUSH_RETRY_PLUGGED;
}
@@ -447,80 +485,160 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
*/
cpu_relax();
}
+ descriptor_status = uv1_read_status(mmr_offset, right_shift);
}
bcp->conseccompletes++;
return FLUSH_COMPLETE;
}
-static inline cycles_t
-sec_2_cycles(unsigned long sec)
+/*
+ * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
+ */
+static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
{
- unsigned long ns;
- cycles_t cyc;
+ unsigned long descriptor_status;
+ unsigned long descriptor_status2;
- ns = sec * 1000000000;
- cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
- return cyc;
+ descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
+ descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
+ descriptor_status = (descriptor_status << 1) | descriptor_status2;
+ return descriptor_status;
+}
+
+static int uv2_wait_completion(struct bau_desc *bau_desc,
+ unsigned long mmr_offset, int right_shift,
+ struct bau_control *bcp, long try)
+{
+ unsigned long descriptor_stat;
+ cycles_t ttm;
+ int cpu = bcp->uvhub_cpu;
+ struct ptc_stats *stat = bcp->statp;
+
+ descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+
+ /* spin on the status MMR, waiting for it to go idle */
+ while (descriptor_stat != UV2H_DESC_IDLE) {
+ /*
+ * Our software ack messages may be blocked because
+ * there are no swack resources available. As long
+ * as none of them has timed out hardware will NACK
+ * our message and its state will stay IDLE.
+ */
+ if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
+ (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
+ (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
+ stat->s_stimeout++;
+ return FLUSH_GIVEUP;
+ } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
+ stat->s_dtimeout++;
+ ttm = get_cycles();
+ /*
+ * Our retries may be blocked by all destination
+ * swack resources being consumed, and a timeout
+ * pending. In that case hardware returns the
+ * ERROR that looks like a destination timeout.
+ */
+ if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_PLUGGED;
+ }
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_TIMEOUT;
+ } else {
+ /*
+ * descriptor_stat is still BUSY
+ */
+ cpu_relax();
+ }
+ descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+ }
+ bcp->conseccompletes++;
+ return FLUSH_COMPLETE;
}
/*
- * conditionally add 1 to *v, unless *v is >= u
- * return 0 if we cannot add 1 to *v because it is >= u
- * return 1 if we can add 1 to *v because it is < u
- * the add is atomic
- *
- * This is close to atomic_add_unless(), but this allows the 'u' value
- * to be lowered below the current 'v'. atomic_add_unless can only stop
- * on equal.
+ * There are 2 status registers; each and array[32] of 2 bits. Set up for
+ * which register to read and position in that register based on cpu in
+ * current hub.
*/
-static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+static int wait_completion(struct bau_desc *bau_desc,
+ struct bau_control *bcp, long try)
{
- spin_lock(lock);
- if (atomic_read(v) >= u) {
- spin_unlock(lock);
- return 0;
+ int right_shift;
+ unsigned long mmr_offset;
+ int cpu = bcp->uvhub_cpu;
+
+ if (cpu < UV_CPUS_PER_AS) {
+ mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+ right_shift = cpu * UV_ACT_STATUS_SIZE;
+ } else {
+ mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+ right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
}
- atomic_inc(v);
- spin_unlock(lock);
- return 1;
+
+ if (is_uv1_hub())
+ return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
+ bcp, try);
+ else
+ return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
+ bcp, try);
+}
+
+static inline cycles_t sec_2_cycles(unsigned long sec)
+{
+ unsigned long ns;
+ cycles_t cyc;
+
+ ns = sec * 1000000000;
+ cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+ return cyc;
}
/*
- * Our retries are blocked by all destination swack resources being
+ * Our retries are blocked by all destination sw ack resources being
* in use, and a timeout is pending. In that case hardware immediately
* returns the ERROR that looks like a destination timeout.
*/
-static void
-destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
+static void destination_plugged(struct bau_desc *bau_desc,
+ struct bau_control *bcp,
struct bau_control *hmaster, struct ptc_stats *stat)
{
udelay(bcp->plugged_delay);
bcp->plugged_tries++;
+
if (bcp->plugged_tries >= bcp->plugsb4reset) {
bcp->plugged_tries = 0;
+
quiesce_local_uvhub(hmaster);
+
spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ reset_with_ipi(&bau_desc->distribution, bcp);
spin_unlock(&hmaster->queue_lock);
+
end_uvhub_quiesce(hmaster);
+
bcp->ipi_attempts++;
stat->s_resets_plug++;
}
}
-static void
-destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
- struct bau_control *hmaster, struct ptc_stats *stat)
+static void destination_timeout(struct bau_desc *bau_desc,
+ struct bau_control *bcp, struct bau_control *hmaster,
+ struct ptc_stats *stat)
{
- hmaster->max_bau_concurrent = 1;
+ hmaster->max_concurr = 1;
bcp->timeout_tries++;
if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
bcp->timeout_tries = 0;
+
quiesce_local_uvhub(hmaster);
+
spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+ reset_with_ipi(&bau_desc->distribution, bcp);
spin_unlock(&hmaster->queue_lock);
+
end_uvhub_quiesce(hmaster);
+
bcp->ipi_attempts++;
stat->s_resets_timeout++;
}
@@ -530,34 +648,104 @@ destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
* Completions are taking a very long time due to a congested numalink
* network.
*/
-static void
-disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
+static void disable_for_congestion(struct bau_control *bcp,
+ struct ptc_stats *stat)
{
- int tcpu;
- struct bau_control *tbcp;
-
/* let only one cpu do this disabling */
spin_lock(&disable_lock);
+
if (!baudisabled && bcp->period_requests &&
((bcp->period_time / bcp->period_requests) > congested_cycles)) {
+ int tcpu;
+ struct bau_control *tbcp;
/* it becomes this cpu's job to turn on the use of the
BAU again */
baudisabled = 1;
bcp->set_bau_off = 1;
- bcp->set_bau_on_time = get_cycles() +
- sec_2_cycles(bcp->congested_period);
+ bcp->set_bau_on_time = get_cycles();
+ bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
stat->s_bau_disabled++;
for_each_present_cpu(tcpu) {
tbcp = &per_cpu(bau_control, tcpu);
- tbcp->baudisabled = 1;
+ tbcp->baudisabled = 1;
}
}
+
spin_unlock(&disable_lock);
}
-/**
- * uv_flush_send_and_wait
- *
+static void count_max_concurr(int stat, struct bau_control *bcp,
+ struct bau_control *hmaster)
+{
+ bcp->plugged_tries = 0;
+ bcp->timeout_tries = 0;
+ if (stat != FLUSH_COMPLETE)
+ return;
+ if (bcp->conseccompletes <= bcp->complete_threshold)
+ return;
+ if (hmaster->max_concurr >= hmaster->max_concurr_const)
+ return;
+ hmaster->max_concurr++;
+}
+
+static void record_send_stats(cycles_t time1, cycles_t time2,
+ struct bau_control *bcp, struct ptc_stats *stat,
+ int completion_status, int try)
+{
+ cycles_t elapsed;
+
+ if (time2 > time1) {
+ elapsed = time2 - time1;
+ stat->s_time += elapsed;
+
+ if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
+ bcp->period_requests++;
+ bcp->period_time += elapsed;
+ if ((elapsed > congested_cycles) &&
+ (bcp->period_requests > bcp->cong_reps))
+ disable_for_congestion(bcp, stat);
+ }
+ } else
+ stat->s_requestor--;
+
+ if (completion_status == FLUSH_COMPLETE && try > 1)
+ stat->s_retriesok++;
+ else if (completion_status == FLUSH_GIVEUP)
+ stat->s_giveup++;
+}
+
+/*
+ * Because of a uv1 hardware bug only a limited number of concurrent
+ * requests can be made.
+ */
+static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ spinlock_t *lock = &hmaster->uvhub_lock;
+ atomic_t *v;
+
+ v = &hmaster->active_descriptor_count;
+ if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
+ stat->s_throttles++;
+ do {
+ cpu_relax();
+ } while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
+ }
+}
+
+/*
+ * Handle the completion status of a message send.
+ */
+static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
+ struct bau_control *bcp, struct bau_control *hmaster,
+ struct ptc_stats *stat)
+{
+ if (completion_status == FLUSH_RETRY_PLUGGED)
+ destination_plugged(bau_desc, bcp, hmaster, stat);
+ else if (completion_status == FLUSH_RETRY_TIMEOUT)
+ destination_timeout(bau_desc, bcp, hmaster, stat);
+}
+
+/*
* Send a broadcast and wait for it to complete.
*
* The flush_mask contains the cpus the broadcast is to be sent to including
@@ -568,44 +756,23 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
* returned to the kernel.
*/
int uv_flush_send_and_wait(struct bau_desc *bau_desc,
- struct cpumask *flush_mask, struct bau_control *bcp)
+ struct cpumask *flush_mask, struct bau_control *bcp)
{
- int right_shift;
- int completion_status = 0;
int seq_number = 0;
+ int completion_stat = 0;
long try = 0;
- int cpu = bcp->uvhub_cpu;
- int this_cpu = bcp->cpu;
- unsigned long mmr_offset;
unsigned long index;
cycles_t time1;
cycles_t time2;
- cycles_t elapsed;
struct ptc_stats *stat = bcp->statp;
- struct bau_control *smaster = bcp->socket_master;
struct bau_control *hmaster = bcp->uvhub_master;
- if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
- &hmaster->active_descriptor_count,
- hmaster->max_bau_concurrent)) {
- stat->s_throttles++;
- do {
- cpu_relax();
- } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
- &hmaster->active_descriptor_count,
- hmaster->max_bau_concurrent));
- }
+ if (is_uv1_hub())
+ uv1_throttle(hmaster, stat);
+
while (hmaster->uvhub_quiesce)
cpu_relax();
- if (cpu < UV_CPUS_PER_ACT_STATUS) {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
- right_shift = cpu * UV_ACT_STATUS_SIZE;
- } else {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
- right_shift =
- ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
- }
time1 = get_cycles();
do {
if (try == 0) {
@@ -615,64 +782,134 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
bau_desc->header.msg_type = MSG_RETRY;
stat->s_retry_messages++;
}
+
bau_desc->header.sequence = seq_number;
- index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
- bcp->uvhub_cpu;
+ index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
bcp->send_message = get_cycles();
- uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+ write_mmr_activation(index);
+
try++;
- completion_status = uv_wait_completion(bau_desc, mmr_offset,
- right_shift, this_cpu, bcp, smaster, try);
+ completion_stat = wait_completion(bau_desc, bcp, try);
+
+ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
- if (completion_status == FLUSH_RETRY_PLUGGED) {
- destination_plugged(bau_desc, bcp, hmaster, stat);
- } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
- destination_timeout(bau_desc, bcp, hmaster, stat);
- }
if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
bcp->ipi_attempts = 0;
- completion_status = FLUSH_GIVEUP;
+ completion_stat = FLUSH_GIVEUP;
break;
}
cpu_relax();
- } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
- (completion_status == FLUSH_RETRY_TIMEOUT));
+ } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
+ (completion_stat == FLUSH_RETRY_TIMEOUT));
+
time2 = get_cycles();
- bcp->plugged_tries = 0;
- bcp->timeout_tries = 0;
- if ((completion_status == FLUSH_COMPLETE) &&
- (bcp->conseccompletes > bcp->complete_threshold) &&
- (hmaster->max_bau_concurrent <
- hmaster->max_bau_concurrent_constant))
- hmaster->max_bau_concurrent++;
+
+ count_max_concurr(completion_stat, bcp, hmaster);
+
while (hmaster->uvhub_quiesce)
cpu_relax();
+
atomic_dec(&hmaster->active_descriptor_count);
- if (time2 > time1) {
- elapsed = time2 - time1;
- stat->s_time += elapsed;
- if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
- bcp->period_requests++;
- bcp->period_time += elapsed;
- if ((elapsed > congested_cycles) &&
- (bcp->period_requests > bcp->congested_reps)) {
- disable_for_congestion(bcp, stat);
+
+ record_send_stats(time1, time2, bcp, stat, completion_stat, try);
+
+ if (completion_stat == FLUSH_GIVEUP)
+ return 1;
+ return 0;
+}
+
+/*
+ * The BAU is disabled. When the disabled time period has expired, the cpu
+ * that disabled it must re-enable it.
+ * Return 0 if it is re-enabled for all cpus.
+ */
+static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
+{
+ int tcpu;
+ struct bau_control *tbcp;
+
+ if (bcp->set_bau_off) {
+ if (get_cycles() >= bcp->set_bau_on_time) {
+ stat->s_bau_reenabled++;
+ baudisabled = 0;
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ tbcp->baudisabled = 0;
+ tbcp->period_requests = 0;
+ tbcp->period_time = 0;
}
+ return 0;
}
+ }
+ return -1;
+}
+
+static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
+ int remotes, struct bau_desc *bau_desc)
+{
+ stat->s_requestor++;
+ stat->s_ntargcpu += remotes + locals;
+ stat->s_ntargremotes += remotes;
+ stat->s_ntarglocals += locals;
+
+ /* uvhub statistics */
+ hubs = bau_uvhub_weight(&bau_desc->distribution);
+ if (locals) {
+ stat->s_ntarglocaluvhub++;
+ stat->s_ntargremoteuvhub += (hubs - 1);
} else
- stat->s_requestor--;
- if (completion_status == FLUSH_COMPLETE && try > 1)
- stat->s_retriesok++;
- else if (completion_status == FLUSH_GIVEUP) {
- stat->s_giveup++;
- return 1;
+ stat->s_ntargremoteuvhub += hubs;
+
+ stat->s_ntarguvhub += hubs;
+
+ if (hubs >= 16)
+ stat->s_ntarguvhub16++;
+ else if (hubs >= 8)
+ stat->s_ntarguvhub8++;
+ else if (hubs >= 4)
+ stat->s_ntarguvhub4++;
+ else if (hubs >= 2)
+ stat->s_ntarguvhub2++;
+ else
+ stat->s_ntarguvhub1++;
+}
+
+/*
+ * Translate a cpu mask to the uvhub distribution mask in the BAU
+ * activation descriptor.
+ */
+static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
+ struct bau_desc *bau_desc, int *localsp, int *remotesp)
+{
+ int cpu;
+ int pnode;
+ int cnt = 0;
+ struct hub_and_pnode *hpp;
+
+ for_each_cpu(cpu, flush_mask) {
+ /*
+ * The distribution vector is a bit map of pnodes, relative
+ * to the partition base pnode (and the partition base nasid
+ * in the header).
+ * Translate cpu to pnode and hub using a local memory array.
+ */
+ hpp = &bcp->socket_master->thp[cpu];
+ pnode = hpp->pnode - bcp->partition_base_pnode;
+ bau_uvhub_set(pnode, &bau_desc->distribution);
+ cnt++;
+ if (hpp->uvhub == bcp->uvhub)
+ (*localsp)++;
+ else
+ (*remotesp)++;
}
+ if (!cnt)
+ return 1;
return 0;
}
-/**
- * uv_flush_tlb_others - globally purge translation cache of a virtual
- * address or all TLB's
+/*
+ * globally purge translation cache of a virtual address or all TLB's
* @cpumask: mask of all cpu's in which the address is to be removed
* @mm: mm_struct containing virtual address range
* @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
@@ -696,11 +933,9 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
* done. The returned pointer is valid till preemption is re-enabled.
*/
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long va, unsigned int cpu)
+ struct mm_struct *mm, unsigned long va,
+ unsigned int cpu)
{
- int tcpu;
- int uvhub;
int locals = 0;
int remotes = 0;
int hubs = 0;
@@ -708,7 +943,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct cpumask *flush_mask;
struct ptc_stats *stat;
struct bau_control *bcp;
- struct bau_control *tbcp;
/* kernel was booted 'nobau' */
if (nobau)
@@ -719,20 +953,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
/* bau was disabled due to slow response */
if (bcp->baudisabled) {
- /* the cpu that disabled it must re-enable it */
- if (bcp->set_bau_off) {
- if (get_cycles() >= bcp->set_bau_on_time) {
- stat->s_bau_reenabled++;
- baudisabled = 0;
- for_each_present_cpu(tcpu) {
- tbcp = &per_cpu(bau_control, tcpu);
- tbcp->baudisabled = 0;
- tbcp->period_requests = 0;
- tbcp->period_time = 0;
- }
- }
- }
- return cpumask;
+ if (check_enable(bcp, stat))
+ return cpumask;
}
/*
@@ -743,52 +965,20 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
/* don't actually do a shootdown of the local cpu */
cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+
if (cpu_isset(cpu, *cpumask))
stat->s_ntargself++;
bau_desc = bcp->descriptor_base;
- bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
+ bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu;
bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
-
- /* cpu statistics */
- for_each_cpu(tcpu, flush_mask) {
- uvhub = uv_cpu_to_blade_id(tcpu);
- bau_uvhub_set(uvhub, &bau_desc->distribution);
- if (uvhub == bcp->uvhub)
- locals++;
- else
- remotes++;
- }
- if ((locals + remotes) == 0)
+ if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
return NULL;
- stat->s_requestor++;
- stat->s_ntargcpu += remotes + locals;
- stat->s_ntargremotes += remotes;
- stat->s_ntarglocals += locals;
- remotes = bau_uvhub_weight(&bau_desc->distribution);
- /* uvhub statistics */
- hubs = bau_uvhub_weight(&bau_desc->distribution);
- if (locals) {
- stat->s_ntarglocaluvhub++;
- stat->s_ntargremoteuvhub += (hubs - 1);
- } else
- stat->s_ntargremoteuvhub += hubs;
- stat->s_ntarguvhub += hubs;
- if (hubs >= 16)
- stat->s_ntarguvhub16++;
- else if (hubs >= 8)
- stat->s_ntarguvhub8++;
- else if (hubs >= 4)
- stat->s_ntarguvhub4++;
- else if (hubs >= 2)
- stat->s_ntarguvhub2++;
- else
- stat->s_ntarguvhub1++;
+ record_send_statistics(stat, locals, hubs, remotes, bau_desc);
bau_desc->payload.address = va;
bau_desc->payload.sending_cpu = cpu;
-
/*
* uv_flush_send_and_wait returns 0 if all cpu's were messaged,
* or 1 if it gave up and the original cpumask should be returned.
@@ -817,26 +1007,31 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
{
int count = 0;
cycles_t time_start;
- struct bau_payload_queue_entry *msg;
+ struct bau_pq_entry *msg;
struct bau_control *bcp;
struct ptc_stats *stat;
struct msg_desc msgdesc;
time_start = get_cycles();
+
bcp = &per_cpu(bau_control, smp_processor_id());
stat = bcp->statp;
- msgdesc.va_queue_first = bcp->va_queue_first;
- msgdesc.va_queue_last = bcp->va_queue_last;
+
+ msgdesc.queue_first = bcp->queue_first;
+ msgdesc.queue_last = bcp->queue_last;
+
msg = bcp->bau_msg_head;
- while (msg->sw_ack_vector) {
+ while (msg->swack_vec) {
count++;
- msgdesc.msg_slot = msg - msgdesc.va_queue_first;
- msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
+
+ msgdesc.msg_slot = msg - msgdesc.queue_first;
+ msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
msgdesc.msg = msg;
- uv_bau_process_message(&msgdesc, bcp);
+ bau_process_message(&msgdesc, bcp);
+
msg++;
- if (msg > msgdesc.va_queue_last)
- msg = msgdesc.va_queue_first;
+ if (msg > msgdesc.queue_last)
+ msg = msgdesc.queue_first;
bcp->bau_msg_head = msg;
}
stat->d_time += (get_cycles() - time_start);
@@ -844,18 +1039,17 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
stat->d_nomsg++;
else if (count > 1)
stat->d_multmsg++;
+
ack_APIC_irq();
}
/*
- * uv_enable_timeouts
- *
- * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
* shootdown message timeouts enabled. The timeout does not cause
* an interrupt, but causes an error message to be returned to
* the sender.
*/
-static void uv_enable_timeouts(void)
+static void __init enable_timeouts(void)
{
int uvhub;
int nuvhubs;
@@ -869,47 +1063,44 @@ static void uv_enable_timeouts(void)
continue;
pnode = uv_blade_to_pnode(uvhub);
- mmr_image =
- uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
+ mmr_image = read_mmr_misc_control(pnode);
/*
* Set the timeout period and then lock it in, in three
* steps; captures and locks in the period.
*
* To program the period, the SOFT_ACK_MODE must be off.
*/
- mmr_image &= ~((unsigned long)1 <<
- UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+ mmr_image &= ~(1L << SOFTACK_MSHIFT);
+ write_mmr_misc_control(pnode, mmr_image);
/*
* Set the 4-bit period.
*/
- mmr_image &= ~((unsigned long)0xf <<
- UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
- mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
- UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+ mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
+ mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
+ write_mmr_misc_control(pnode, mmr_image);
/*
+ * UV1:
* Subsequent reversals of the timebase bit (3) cause an
* immediate timeout of one or all INTD resources as
* indicated in bits 2:0 (7 causes all of them to timeout).
*/
- mmr_image |= ((unsigned long)1 <<
- UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+ mmr_image |= (1L << SOFTACK_MSHIFT);
+ if (is_uv2_hub()) {
+ mmr_image |= (1L << UV2_LEG_SHFT);
+ mmr_image |= (1L << UV2_EXT_SHFT);
+ }
+ write_mmr_misc_control(pnode, mmr_image);
}
}
-static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
+static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
{
if (*offset < num_possible_cpus())
return offset;
return NULL;
}
-static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
+static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
{
(*offset)++;
if (*offset < num_possible_cpus())
@@ -917,12 +1108,11 @@ static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
return NULL;
}
-static void uv_ptc_seq_stop(struct seq_file *file, void *data)
+static void ptc_seq_stop(struct seq_file *file, void *data)
{
}
-static inline unsigned long long
-microsec_2_cycles(unsigned long microsec)
+static inline unsigned long long usec_2_cycles(unsigned long microsec)
{
unsigned long ns;
unsigned long long cyc;
@@ -933,29 +1123,27 @@ microsec_2_cycles(unsigned long microsec)
}
/*
- * Display the statistics thru /proc.
+ * Display the statistics thru /proc/sgi_uv/ptc_statistics
* 'data' points to the cpu number
+ * Note: see the descriptions in stat_description[].
*/
-static int uv_ptc_seq_show(struct seq_file *file, void *data)
+static int ptc_seq_show(struct seq_file *file, void *data)
{
struct ptc_stats *stat;
int cpu;
cpu = *(loff_t *)data;
-
if (!cpu) {
seq_printf(file,
"# cpu sent stime self locals remotes ncpus localhub ");
seq_printf(file,
"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
seq_printf(file,
- "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
- seq_printf(file,
- "retries rok resetp resett giveup sto bz throt ");
+ "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok ");
seq_printf(file,
- "sw_ack recv rtime all ");
+ "resetp resett giveup sto bz throt swack recv rtime ");
seq_printf(file,
- "one mult none retry canc nocan reset rcan ");
+ "all one mult none retry canc nocan reset rcan ");
seq_printf(file,
"disable enable\n");
}
@@ -982,8 +1170,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
/* destination side statistics */
seq_printf(file,
"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
- uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
+ read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
stat->d_requestee, cycles_2_us(stat->d_time),
stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
stat->d_nomsg, stat->d_retries, stat->d_canceled,
@@ -992,7 +1179,6 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
seq_printf(file, "%ld %ld\n",
stat->s_bau_disabled, stat->s_bau_reenabled);
}
-
return 0;
}
@@ -1000,18 +1186,18 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
* Display the tunables thru debugfs
*/
static ssize_t tunables_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
char *buf;
int ret;
buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
- "max_bau_concurrent plugged_delay plugsb4reset",
+ "max_concur plugged_delay plugsb4reset",
"timeoutsb4reset ipi_reset_limit complete_threshold",
"congested_response_us congested_reps congested_period",
- max_bau_concurrent, plugged_delay, plugsb4reset,
+ max_concurr, plugged_delay, plugsb4reset,
timeoutsb4reset, ipi_reset_limit, complete_threshold,
- congested_response_us, congested_reps, congested_period);
+ congested_respns_us, congested_reps, congested_period);
if (!buf)
return -ENOMEM;
@@ -1022,13 +1208,16 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,
}
/*
- * -1: resetf the statistics
+ * handle a write to /proc/sgi_uv/ptc_statistics
+ * -1: reset the statistics
* 0: display meaning of the statistics
*/
-static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
- size_t count, loff_t *data)
+static ssize_t ptc_proc_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
{
int cpu;
+ int i;
+ int elements;
long input_arg;
char optstr[64];
struct ptc_stats *stat;
@@ -1038,79 +1227,18 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
if (copy_from_user(optstr, user, count))
return -EFAULT;
optstr[count - 1] = '\0';
+
if (strict_strtol(optstr, 10, &input_arg) < 0) {
printk(KERN_DEBUG "%s is invalid\n", optstr);
return -EINVAL;
}
if (input_arg == 0) {
+ elements = sizeof(stat_description)/sizeof(*stat_description);
printk(KERN_DEBUG "# cpu: cpu number\n");
printk(KERN_DEBUG "Sender statistics:\n");
- printk(KERN_DEBUG
- "sent: number of shootdown messages sent\n");
- printk(KERN_DEBUG
- "stime: time spent sending messages\n");
- printk(KERN_DEBUG
- "numuvhubs: number of hubs targeted with shootdown\n");
- printk(KERN_DEBUG
- "numuvhubs16: number times 16 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs8: number times 8 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs4: number times 4 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs2: number times 2 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs1: number times 1 hub targeted\n");
- printk(KERN_DEBUG
- "numcpus: number of cpus targeted with shootdown\n");
- printk(KERN_DEBUG
- "dto: number of destination timeouts\n");
- printk(KERN_DEBUG
- "retries: destination timeout retries sent\n");
- printk(KERN_DEBUG
- "rok: : destination timeouts successfully retried\n");
- printk(KERN_DEBUG
- "resetp: ipi-style resource resets for plugs\n");
- printk(KERN_DEBUG
- "resett: ipi-style resource resets for timeouts\n");
- printk(KERN_DEBUG
- "giveup: fall-backs to ipi-style shootdowns\n");
- printk(KERN_DEBUG
- "sto: number of source timeouts\n");
- printk(KERN_DEBUG
- "bz: number of stay-busy's\n");
- printk(KERN_DEBUG
- "throt: number times spun in throttle\n");
- printk(KERN_DEBUG "Destination side statistics:\n");
- printk(KERN_DEBUG
- "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
- printk(KERN_DEBUG
- "recv: shootdown messages received\n");
- printk(KERN_DEBUG
- "rtime: time spent processing messages\n");
- printk(KERN_DEBUG
- "all: shootdown all-tlb messages\n");
- printk(KERN_DEBUG
- "one: shootdown one-tlb messages\n");
- printk(KERN_DEBUG
- "mult: interrupts that found multiple messages\n");
- printk(KERN_DEBUG
- "none: interrupts that found no messages\n");
- printk(KERN_DEBUG
- "retry: number of retry messages processed\n");
- printk(KERN_DEBUG
- "canc: number messages canceled by retries\n");
- printk(KERN_DEBUG
- "nocan: number retries that found nothing to cancel\n");
- printk(KERN_DEBUG
- "reset: number of ipi-style reset requests processed\n");
- printk(KERN_DEBUG
- "rcan: number messages canceled by reset requests\n");
- printk(KERN_DEBUG
- "disable: number times use of the BAU was disabled\n");
- printk(KERN_DEBUG
- "enable: number times use of the BAU was re-enabled\n");
+ for (i = 0; i < elements; i++)
+ printk(KERN_DEBUG "%s\n", stat_description[i]);
} else if (input_arg == -1) {
for_each_present_cpu(cpu) {
stat = &per_cpu(ptcstats, cpu);
@@ -1137,27 +1265,18 @@ static int local_atoi(const char *name)
}
/*
- * set the tunables
- * 0 values reset them to defaults
+ * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
+ * Zero values reset them to defaults.
*/
-static ssize_t tunables_write(struct file *file, const char __user *user,
- size_t count, loff_t *data)
+static int parse_tunables_write(struct bau_control *bcp, char *instr,
+ int count)
{
- int cpu;
- int cnt = 0;
- int val;
char *p;
char *q;
- char instr[64];
- struct bau_control *bcp;
-
- if (count == 0 || count > sizeof(instr)-1)
- return -EINVAL;
- if (copy_from_user(instr, user, count))
- return -EFAULT;
+ int cnt = 0;
+ int val;
+ int e = sizeof(tunables) / sizeof(*tunables);
- instr[count] = '\0';
- /* count the fields */
p = instr + strspn(instr, WHITESPACE);
q = p;
for (; *p; p = q + strspn(q, WHITESPACE)) {
@@ -1166,8 +1285,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
if (q == p)
break;
}
- if (cnt != 9) {
- printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
+ if (cnt != e) {
+ printk(KERN_INFO "bau tunable error: should be %d values\n", e);
return -EINVAL;
}
@@ -1179,97 +1298,81 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
switch (cnt) {
case 0:
if (val == 0) {
- max_bau_concurrent = MAX_BAU_CONCURRENT;
- max_bau_concurrent_constant =
- MAX_BAU_CONCURRENT;
+ max_concurr = MAX_BAU_CONCURRENT;
+ max_concurr_const = MAX_BAU_CONCURRENT;
continue;
}
- bcp = &per_cpu(bau_control, smp_processor_id());
if (val < 1 || val > bcp->cpus_in_uvhub) {
printk(KERN_DEBUG
"Error: BAU max concurrent %d is invalid\n",
val);
return -EINVAL;
}
- max_bau_concurrent = val;
- max_bau_concurrent_constant = val;
+ max_concurr = val;
+ max_concurr_const = val;
continue;
- case 1:
- if (val == 0)
- plugged_delay = PLUGGED_DELAY;
- else
- plugged_delay = val;
- continue;
- case 2:
- if (val == 0)
- plugsb4reset = PLUGSB4RESET;
- else
- plugsb4reset = val;
- continue;
- case 3:
- if (val == 0)
- timeoutsb4reset = TIMEOUTSB4RESET;
- else
- timeoutsb4reset = val;
- continue;
- case 4:
- if (val == 0)
- ipi_reset_limit = IPI_RESET_LIMIT;
- else
- ipi_reset_limit = val;
- continue;
- case 5:
- if (val == 0)
- complete_threshold = COMPLETE_THRESHOLD;
- else
- complete_threshold = val;
- continue;
- case 6:
- if (val == 0)
- congested_response_us = CONGESTED_RESPONSE_US;
- else
- congested_response_us = val;
- continue;
- case 7:
- if (val == 0)
- congested_reps = CONGESTED_REPS;
- else
- congested_reps = val;
- continue;
- case 8:
+ default:
if (val == 0)
- congested_period = CONGESTED_PERIOD;
+ *tunables[cnt].tunp = tunables[cnt].deflt;
else
- congested_period = val;
+ *tunables[cnt].tunp = val;
continue;
}
if (q == p)
break;
}
+ return 0;
+}
+
+/*
+ * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
+ */
+static ssize_t tunables_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
+{
+ int cpu;
+ int ret;
+ char instr[100];
+ struct bau_control *bcp;
+
+ if (count == 0 || count > sizeof(instr)-1)
+ return -EINVAL;
+ if (copy_from_user(instr, user, count))
+ return -EFAULT;
+
+ instr[count] = '\0';
+
+ cpu = get_cpu();
+ bcp = &per_cpu(bau_control, cpu);
+ ret = parse_tunables_write(bcp, instr, count);
+ put_cpu();
+ if (ret)
+ return ret;
+
for_each_present_cpu(cpu) {
bcp = &per_cpu(bau_control, cpu);
- bcp->max_bau_concurrent = max_bau_concurrent;
- bcp->max_bau_concurrent_constant = max_bau_concurrent;
- bcp->plugged_delay = plugged_delay;
- bcp->plugsb4reset = plugsb4reset;
- bcp->timeoutsb4reset = timeoutsb4reset;
- bcp->ipi_reset_limit = ipi_reset_limit;
- bcp->complete_threshold = complete_threshold;
- bcp->congested_response_us = congested_response_us;
- bcp->congested_reps = congested_reps;
- bcp->congested_period = congested_period;
+ bcp->max_concurr = max_concurr;
+ bcp->max_concurr_const = max_concurr;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->cong_response_us = congested_respns_us;
+ bcp->cong_reps = congested_reps;
+ bcp->cong_period = congested_period;
}
return count;
}
static const struct seq_operations uv_ptc_seq_ops = {
- .start = uv_ptc_seq_start,
- .next = uv_ptc_seq_next,
- .stop = uv_ptc_seq_stop,
- .show = uv_ptc_seq_show
+ .start = ptc_seq_start,
+ .next = ptc_seq_next,
+ .stop = ptc_seq_stop,
+ .show = ptc_seq_show
};
-static int uv_ptc_proc_open(struct inode *inode, struct file *file)
+static int ptc_proc_open(struct inode *inode, struct file *file)
{
return seq_open(file, &uv_ptc_seq_ops);
}
@@ -1280,9 +1383,9 @@ static int tunables_open(struct inode *inode, struct file *file)
}
static const struct file_operations proc_uv_ptc_operations = {
- .open = uv_ptc_proc_open,
+ .open = ptc_proc_open,
.read = seq_read,
- .write = uv_ptc_proc_write,
+ .write = ptc_proc_write,
.llseek = seq_lseek,
.release = seq_release,
};
@@ -1316,7 +1419,7 @@ static int __init uv_ptc_init(void)
return -EINVAL;
}
tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
- tunables_dir, NULL, &tunables_fops);
+ tunables_dir, NULL, &tunables_fops);
if (!tunables_file) {
printk(KERN_ERR "unable to create debugfs file %s\n",
UV_BAU_TUNABLES_FILE);
@@ -1326,53 +1429,52 @@ static int __init uv_ptc_init(void)
}
/*
- * initialize the sending side's sending buffers
+ * Initialize the sending side's sending buffers.
*/
-static void
-uv_activation_descriptor_init(int node, int pnode)
+static void activation_descriptor_init(int node, int pnode, int base_pnode)
{
int i;
int cpu;
unsigned long pa;
unsigned long m;
unsigned long n;
+ size_t dsize;
struct bau_desc *bau_desc;
struct bau_desc *bd2;
struct bau_control *bcp;
/*
- * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
- * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
+ * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
+ * per cpu; and one per cpu on the uvhub (ADP_SZ)
*/
- bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
- * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
+ dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
+ bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
BUG_ON(!bau_desc);
pa = uv_gpa(bau_desc); /* need the real nasid*/
n = pa >> uv_nshift;
m = pa & uv_mmask;
- uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
- (n << UV_DESC_BASE_PNODE_SHIFT | m));
-
+ /* the 14-bit pnode */
+ write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
/*
- * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+ * Initializing all 8 (ITEMS_PER_DESC) descriptors for each
* cpu even though we only use the first one; one descriptor can
* describe a broadcast to 256 uv hubs.
*/
- for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
- i++, bd2++) {
+ for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
memset(bd2, 0, sizeof(struct bau_desc));
- bd2->header.sw_ack_flag = 1;
+ bd2->header.swack_flag = 1;
/*
- * base_dest_nodeid is the nasid of the first uvhub
- * in the partition. The bit map will indicate uvhub numbers,
- * which are 0-N in a partition. Pnodes are unique system-wide.
+ * The base_dest_nasid set in the message header is the nasid
+ * of the first uvhub in the partition. The bit map will
+ * indicate destination pnode numbers relative to that base.
+ * They may not be consecutive if nasid striding is being used.
*/
- bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode);
- bd2->header.dest_subnodeid = 0x10; /* the LB */
- bd2->header.command = UV_NET_ENDPOINT_INTD;
- bd2->header.int_both = 1;
+ bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
+ bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
+ bd2->header.command = UV_NET_ENDPOINT_INTD;
+ bd2->header.int_both = 1;
/*
* all others need to be set to zero:
* fairness chaining multilevel count replied_to
@@ -1392,57 +1494,55 @@ uv_activation_descriptor_init(int node, int pnode)
* - node is first node (kernel memory notion) on the uvhub
* - pnode is the uvhub's physical identifier
*/
-static void
-uv_payload_queue_init(int node, int pnode)
+static void pq_init(int node, int pnode)
{
- int pn;
int cpu;
+ size_t plsize;
char *cp;
- unsigned long pa;
- struct bau_payload_queue_entry *pqp;
- struct bau_payload_queue_entry *pqp_malloc;
+ void *vp;
+ unsigned long pn;
+ unsigned long first;
+ unsigned long pn_first;
+ unsigned long last;
+ struct bau_pq_entry *pqp;
struct bau_control *bcp;
- pqp = kmalloc_node((DEST_Q_SIZE + 1)
- * sizeof(struct bau_payload_queue_entry),
- GFP_KERNEL, node);
+ plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
+ vp = kmalloc_node(plsize, GFP_KERNEL, node);
+ pqp = (struct bau_pq_entry *)vp;
BUG_ON(!pqp);
- pqp_malloc = pqp;
cp = (char *)pqp + 31;
- pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
+ pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);
for_each_present_cpu(cpu) {
if (pnode != uv_cpu_to_pnode(cpu))
continue;
/* for every cpu on this pnode: */
bcp = &per_cpu(bau_control, cpu);
- bcp->va_queue_first = pqp;
- bcp->bau_msg_head = pqp;
- bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+ bcp->queue_first = pqp;
+ bcp->bau_msg_head = pqp;
+ bcp->queue_last = pqp + (DEST_Q_SIZE - 1);
}
/*
* need the pnode of where the memory was really allocated
*/
- pa = uv_gpa(pqp);
- pn = pa >> uv_nshift;
- uv_write_global_mmr64(pnode,
- UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
- ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
- uv_physnodeaddr(pqp));
- uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
- uv_physnodeaddr(pqp));
- uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
- (unsigned long)
- uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
+ pn = uv_gpa(pqp) >> uv_nshift;
+ first = uv_physnodeaddr(pqp);
+ pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
+ last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
+ write_mmr_payload_first(pnode, pn_first);
+ write_mmr_payload_tail(pnode, first);
+ write_mmr_payload_last(pnode, last);
+
/* in effect, all msg_type's are set to MSG_NOOP */
- memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
+ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
}
/*
* Initialization of each UV hub's structures
*/
-static void __init uv_init_uvhub(int uvhub, int vector)
+static void __init init_uvhub(int uvhub, int vector, int base_pnode)
{
int node;
int pnode;
@@ -1450,24 +1550,24 @@ static void __init uv_init_uvhub(int uvhub, int vector)
node = uvhub_to_first_node(uvhub);
pnode = uv_blade_to_pnode(uvhub);
- uv_activation_descriptor_init(node, pnode);
- uv_payload_queue_init(node, pnode);
+
+ activation_descriptor_init(node, pnode, base_pnode);
+
+ pq_init(node, pnode);
/*
- * the below initialization can't be in firmware because the
- * messaging IRQ will be determined by the OS
+ * The below initialization can't be in firmware because the
+ * messaging IRQ will be determined by the OS.
*/
apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
- ((apicid << 32) | vector));
+ write_mmr_data_config(pnode, ((apicid << 32) | vector));
}
/*
* We will set BAU_MISC_CONTROL with a timeout period.
* But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
- * So the destination timeout period has be be calculated from them.
+ * So the destination timeout period has to be calculated from them.
*/
-static int
-calculate_destination_timeout(void)
+static int calculate_destination_timeout(void)
{
unsigned long mmr_image;
int mult1;
@@ -1477,136 +1577,240 @@ calculate_destination_timeout(void)
int ret;
unsigned long ts_ns;
- mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
- mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
- index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
- mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
- mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
- base = timeout_base_ns[index];
- ts_ns = base * mult1 * mult2;
- ret = ts_ns / 1000;
+ if (is_uv1_hub()) {
+ mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
+ mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+ index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
+ mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
+ mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
+ base = timeout_base_ns[index];
+ ts_ns = base * mult1 * mult2;
+ ret = ts_ns / 1000;
+ } else {
+ /* 4 bits 0/1 for 10/80us, 3 bits of multiplier */
+ mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+ mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
+ if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
+ mult1 = 80;
+ else
+ mult1 = 10;
+ base = mmr_image & UV2_ACK_MASK;
+ ret = mult1 * base;
+ }
return ret;
}
+static void __init init_per_cpu_tunables(void)
+{
+ int cpu;
+ struct bau_control *bcp;
+
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->baudisabled = 0;
+ bcp->statp = &per_cpu(ptcstats, cpu);
+ /* time interval to catch a hardware stay-busy bug */
+ bcp->timeout_interval = usec_2_cycles(2*timeout_us);
+ bcp->max_concurr = max_concurr;
+ bcp->max_concurr_const = max_concurr;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->cong_response_us = congested_respns_us;
+ bcp->cong_reps = congested_reps;
+ bcp->cong_period = congested_period;
+ }
+}
+
/*
- * initialize the bau_control structure for each cpu
+ * Scan all cpus to collect blade and socket summaries.
*/
-static int __init uv_init_per_cpu(int nuvhubs)
+static int __init get_cpu_topology(int base_pnode,
+ struct uvhub_desc *uvhub_descs,
+ unsigned char *uvhub_mask)
{
- int i;
int cpu;
int pnode;
int uvhub;
- int have_hmaster;
- short socket = 0;
- unsigned short socket_mask;
- unsigned char *uvhub_mask;
+ int socket;
struct bau_control *bcp;
struct uvhub_desc *bdp;
struct socket_desc *sdp;
- struct bau_control *hmaster = NULL;
- struct bau_control *smaster = NULL;
- struct socket_desc {
- short num_cpus;
- short cpu_number[MAX_CPUS_PER_SOCKET];
- };
- struct uvhub_desc {
- unsigned short socket_mask;
- short num_cpus;
- short uvhub;
- short pnode;
- struct socket_desc socket[2];
- };
- struct uvhub_desc *uvhub_descs;
-
- timeout_us = calculate_destination_timeout();
- uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
- memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
- uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
for_each_present_cpu(cpu) {
bcp = &per_cpu(bau_control, cpu);
+
memset(bcp, 0, sizeof(struct bau_control));
+
pnode = uv_cpu_hub_info(cpu)->pnode;
+ if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
+ printk(KERN_EMERG
+ "cpu %d pnode %d-%d beyond %d; BAU disabled\n",
+ cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
+ return 1;
+ }
+
+ bcp->osnode = cpu_to_node(cpu);
+ bcp->partition_base_pnode = base_pnode;
+
uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
bdp = &uvhub_descs[uvhub];
+
bdp->num_cpus++;
bdp->uvhub = uvhub;
bdp->pnode = pnode;
+
/* kludge: 'assuming' one node per socket, and assuming that
disabling a socket just leaves a gap in node numbers */
- socket = (cpu_to_node(cpu) & 1);
+ socket = bcp->osnode & 1;
bdp->socket_mask |= (1 << socket);
sdp = &bdp->socket[socket];
sdp->cpu_number[sdp->num_cpus] = cpu;
sdp->num_cpus++;
if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
- printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
+ printk(KERN_EMERG "%d cpus per socket invalid\n",
+ sdp->num_cpus);
return 1;
}
}
+ return 0;
+}
+
+/*
+ * Each socket is to get a local array of pnodes/hubs.
+ */
+static void make_per_cpu_thp(struct bau_control *smaster)
+{
+ int cpu;
+ size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
+
+ smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
+ memset(smaster->thp, 0, hpsz);
+ for_each_present_cpu(cpu) {
+ smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
+ smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+ }
+}
+
+/*
+ * Each uvhub is to get a local cpumask.
+ */
+static void make_per_hub_cpumask(struct bau_control *hmaster)
+{
+ int sz = sizeof(cpumask_t);
+
+ hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode);
+}
+
+/*
+ * Initialize all the per_cpu information for the cpu's on a given socket,
+ * given what has been gathered into the socket_desc struct.
+ * And reports the chosen hub and socket masters back to the caller.
+ */
+static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
+ struct bau_control **smasterp,
+ struct bau_control **hmasterp)
+{
+ int i;
+ int cpu;
+ struct bau_control *bcp;
+
+ for (i = 0; i < sdp->num_cpus; i++) {
+ cpu = sdp->cpu_number[i];
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->cpu = cpu;
+ if (i == 0) {
+ *smasterp = bcp;
+ if (!(*hmasterp))
+ *hmasterp = bcp;
+ }
+ bcp->cpus_in_uvhub = bdp->num_cpus;
+ bcp->cpus_in_socket = sdp->num_cpus;
+ bcp->socket_master = *smasterp;
+ bcp->uvhub = bdp->uvhub;
+ bcp->uvhub_master = *hmasterp;
+ bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+ if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
+ printk(KERN_EMERG "%d cpus per uvhub invalid\n",
+ bcp->uvhub_cpu);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Summarize the blade and socket topology into the per_cpu structures.
+ */
+static int __init summarize_uvhub_sockets(int nuvhubs,
+ struct uvhub_desc *uvhub_descs,
+ unsigned char *uvhub_mask)
+{
+ int socket;
+ int uvhub;
+ unsigned short socket_mask;
+
for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ struct uvhub_desc *bdp;
+ struct bau_control *smaster = NULL;
+ struct bau_control *hmaster = NULL;
+
if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
continue;
- have_hmaster = 0;
+
bdp = &uvhub_descs[uvhub];
socket_mask = bdp->socket_mask;
socket = 0;
while (socket_mask) {
- if (!(socket_mask & 1))
- goto nextsocket;
- sdp = &bdp->socket[socket];
- for (i = 0; i < sdp->num_cpus; i++) {
- cpu = sdp->cpu_number[i];
- bcp = &per_cpu(bau_control, cpu);
- bcp->cpu = cpu;
- if (i == 0) {
- smaster = bcp;
- if (!have_hmaster) {
- have_hmaster++;
- hmaster = bcp;
- }
- }
- bcp->cpus_in_uvhub = bdp->num_cpus;
- bcp->cpus_in_socket = sdp->num_cpus;
- bcp->socket_master = smaster;
- bcp->uvhub = bdp->uvhub;
- bcp->uvhub_master = hmaster;
- bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
- blade_processor_id;
- if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
- printk(KERN_EMERG
- "%d cpus per uvhub invalid\n",
- bcp->uvhub_cpu);
+ struct socket_desc *sdp;
+ if ((socket_mask & 1)) {
+ sdp = &bdp->socket[socket];
+ if (scan_sock(sdp, bdp, &smaster, &hmaster))
return 1;
- }
+ make_per_cpu_thp(smaster);
}
-nextsocket:
socket++;
socket_mask = (socket_mask >> 1);
}
+ make_per_hub_cpumask(hmaster);
}
+ return 0;
+}
+
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
+{
+ unsigned char *uvhub_mask;
+ void *vp;
+ struct uvhub_desc *uvhub_descs;
+
+ timeout_us = calculate_destination_timeout();
+
+ vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+ uvhub_descs = (struct uvhub_desc *)vp;
+ memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+ uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
+
+ if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
+ goto fail;
+
+ if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
+ goto fail;
+
kfree(uvhub_descs);
kfree(uvhub_mask);
- for_each_present_cpu(cpu) {
- bcp = &per_cpu(bau_control, cpu);
- bcp->baudisabled = 0;
- bcp->statp = &per_cpu(ptcstats, cpu);
- /* time interval to catch a hardware stay-busy bug */
- bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
- bcp->max_bau_concurrent = max_bau_concurrent;
- bcp->max_bau_concurrent_constant = max_bau_concurrent;
- bcp->plugged_delay = plugged_delay;
- bcp->plugsb4reset = plugsb4reset;
- bcp->timeoutsb4reset = timeoutsb4reset;
- bcp->ipi_reset_limit = ipi_reset_limit;
- bcp->complete_threshold = complete_threshold;
- bcp->congested_response_us = congested_response_us;
- bcp->congested_reps = congested_reps;
- bcp->congested_period = congested_period;
- }
+ init_per_cpu_tunables();
return 0;
+
+fail:
+ kfree(uvhub_descs);
+ kfree(uvhub_mask);
+ return 1;
}
/*
@@ -1618,8 +1822,9 @@ static int __init uv_bau_init(void)
int pnode;
int nuvhubs;
int cur_cpu;
+ int cpus;
int vector;
- unsigned long mmr;
+ cpumask_var_t *mask;
if (!is_uv_system())
return 0;
@@ -1627,45 +1832,47 @@ static int __init uv_bau_init(void)
if (nobau)
return 0;
- for_each_possible_cpu(cur_cpu)
- zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
- GFP_KERNEL, cpu_to_node(cur_cpu));
+ for_each_possible_cpu(cur_cpu) {
+ mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
+ zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
+ }
uv_nshift = uv_hub_info->m_val;
uv_mmask = (1UL << uv_hub_info->m_val) - 1;
nuvhubs = uv_num_possible_blades();
spin_lock_init(&disable_lock);
- congested_cycles = microsec_2_cycles(congested_response_us);
+ congested_cycles = usec_2_cycles(congested_respns_us);
- if (uv_init_per_cpu(nuvhubs)) {
+ uv_base_pnode = 0x7fffffff;
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ cpus = uv_blade_nr_possible_cpus(uvhub);
+ if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
+ uv_base_pnode = uv_blade_to_pnode(uvhub);
+ }
+
+ if (init_per_cpu(nuvhubs, uv_base_pnode)) {
nobau = 1;
return 0;
}
- uv_partition_base_pnode = 0x7fffffff;
- for (uvhub = 0; uvhub < nuvhubs; uvhub++)
- if (uv_blade_nr_possible_cpus(uvhub) &&
- (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
- uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
-
vector = UV_BAU_MESSAGE;
for_each_possible_blade(uvhub)
if (uv_blade_nr_possible_cpus(uvhub))
- uv_init_uvhub(uvhub, vector);
+ init_uvhub(uvhub, vector, uv_base_pnode);
- uv_enable_timeouts();
+ enable_timeouts();
alloc_intr_gate(vector, uv_bau_message_intr1);
for_each_possible_blade(uvhub) {
if (uv_blade_nr_possible_cpus(uvhub)) {
+ unsigned long val;
+ unsigned long mmr;
pnode = uv_blade_to_pnode(uvhub);
/* INIT the bau */
- uv_write_global_mmr64(pnode,
- UVH_LB_BAU_SB_ACTIVATION_CONTROL,
- ((unsigned long)1 << 63));
+ val = 1L << 63;
+ write_gmmr_activation(pnode, val);
mmr = 1; /* should be 1 to broadcast to both sockets */
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
- mmr);
+ write_mmr_data_broadcast(pnode, mmr);
}
}
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 9daf5d1af9f..9f29a01ee1b 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = {
.rating = 400,
.read = uv_read_rtc,
.mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
- .shift = 10,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -100,8 +99,12 @@ static void uv_rtc_send_IPI(int cpu)
/* Check for an RTC interrupt pending */
static int uv_intr_pending(int pnode)
{
- return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
- UVH_EVENT_OCCURRED0_RTC1_MASK;
+ if (is_uv1_hub())
+ return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+ UV1H_EVENT_OCCURRED0_RTC1_MASK;
+ else
+ return uv_read_global_mmr64(pnode, UV2H_EVENT_OCCURRED2) &
+ UV2H_EVENT_OCCURRED2_RTC_1_MASK;
}
/* Setup interrupt and return non-zero if early expiration occurred. */
@@ -115,8 +118,12 @@ static int uv_setup_intr(int cpu, u64 expires)
UVH_RTC1_INT_CONFIG_M_MASK);
uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
- uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
- UVH_EVENT_OCCURRED0_RTC1_MASK);
+ if (is_uv1_hub())
+ uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+ UV1H_EVENT_OCCURRED0_RTC1_MASK);
+ else
+ uv_write_global_mmr64(pnode, UV2H_EVENT_OCCURRED2_ALIAS,
+ UV2H_EVENT_OCCURRED2_RTC_1_MASK);
val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
@@ -372,14 +379,11 @@ static __init int uv_rtc_setup_clock(void)
if (!is_uv_system())
return -ENODEV;
- clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
- clocksource_uv.shift);
-
/* If single blade, prefer tsc */
if (uv_num_possible_blades() == 1)
clocksource_uv.rating = 250;
- rc = clocksource_register(&clocksource_uv);
+ rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second);
if (rc)
printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
else
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b6552b189bc..5d179502a52 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -11,7 +11,7 @@ vdso-install-$(VDSO32-y) += $(vdso32-images)
# files to link into the vdso
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
# files to link into kernel
obj-$(VDSO64-y) += vma.o vdso.o
@@ -26,6 +26,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
export CPPFLAGS_vdso.lds += -P -C
VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
+ -Wl,--no-undefined \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
@@ -37,11 +38,24 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
+#
+# Don't omit frame pointers for ease of userspace debugging, but do
+# optimize sibling calls.
+#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
- $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector)
+ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
+ -fno-omit-frame-pointer -foptimize-sibling-calls
$(vobjs): KBUILD_CFLAGS += $(CFL)
+#
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+#
+CFLAGS_REMOVE_vdso-note.o = -pg
+CFLAGS_REMOVE_vclock_gettime.o = -pg
+CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vvar.o = -pg
+
targets += vdso-syms.lds
obj-$(VDSO64-y) += vdso-syms.lds
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754cc3c..6bc0e723b6e 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -2,11 +2,10 @@
* Copyright 2006 Andi Kleen, SUSE Labs.
* Subject to the GNU Public License, v.2
*
- * Fast user context implementation of clock_gettime and gettimeofday.
+ * Fast user context implementation of clock_gettime, gettimeofday, and time.
*
* The code should have no internal unresolved relocations.
* Check with readelf after changing.
- * Also alternative() doesn't work.
*/
/* Disable profiling for userspace code: */
@@ -17,14 +16,51 @@
#include <linux/time.h>
#include <linux/string.h>
#include <asm/vsyscall.h>
+#include <asm/fixmap.h>
#include <asm/vgtod.h>
#include <asm/timex.h>
#include <asm/hpet.h>
#include <asm/unistd.h>
#include <asm/io.h>
-#include "vextern.h"
-#define gtod vdso_vsyscall_gtod_data
+#define gtod (&VVAR(vsyscall_gtod_data))
+
+notrace static cycle_t vread_tsc(void)
+{
+ cycle_t ret;
+ u64 last;
+
+ /*
+ * Empirically, a fence (of type that depends on the CPU)
+ * before rdtsc is enough to ensure that rdtsc is ordered
+ * with respect to loads. The various CPU manuals are unclear
+ * as to whether rdtsc can be reordered with later loads,
+ * but no one has ever seen it happen.
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+
+ last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a funciton of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+static notrace cycle_t vread_hpet(void)
+{
+ return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+}
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
@@ -37,9 +73,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
notrace static inline long vgetns(void)
{
long v;
- cycles_t (*vread)(void);
- vread = gtod->clock.vread;
- v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask;
+ cycles_t cycles;
+ if (gtod->clock.vclock_mode == VCLOCK_TSC)
+ cycles = vread_tsc();
+ else
+ cycles = vread_hpet();
+ v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
return (v * gtod->clock.mult) >> gtod->clock.shift;
}
@@ -56,22 +95,6 @@ notrace static noinline int do_realtime(struct timespec *ts)
return 0;
}
-/* Copy of the version in kernel/time.c which we cannot directly access */
-notrace static void
-vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
-{
- while (nsec >= NSEC_PER_SEC) {
- nsec -= NSEC_PER_SEC;
- ++sec;
- }
- while (nsec < 0) {
- nsec += NSEC_PER_SEC;
- --sec;
- }
- ts->tv_sec = sec;
- ts->tv_nsec = nsec;
-}
-
notrace static noinline int do_monotonic(struct timespec *ts)
{
unsigned long seq, ns, secs;
@@ -82,7 +105,17 @@ notrace static noinline int do_monotonic(struct timespec *ts)
secs += gtod->wall_to_monotonic.tv_sec;
ns += gtod->wall_to_monotonic.tv_nsec;
} while (unlikely(read_seqretry(&gtod->lock, seq)));
- vset_normalized_timespec(ts, secs, ns);
+
+ /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
+ * are all guaranteed to be nonnegative.
+ */
+ while (ns >= NSEC_PER_SEC) {
+ ns -= NSEC_PER_SEC;
+ ++secs;
+ }
+ ts->tv_sec = secs;
+ ts->tv_nsec = ns;
+
return 0;
}
@@ -107,27 +140,37 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
secs += gtod->wall_to_monotonic.tv_sec;
ns += gtod->wall_to_monotonic.tv_nsec;
} while (unlikely(read_seqretry(&gtod->lock, seq)));
- vset_normalized_timespec(ts, secs, ns);
+
+ /* wall_time_nsec and wall_to_monotonic.tv_nsec are
+ * guaranteed to be between 0 and NSEC_PER_SEC.
+ */
+ if (ns >= NSEC_PER_SEC) {
+ ns -= NSEC_PER_SEC;
+ ++secs;
+ }
+ ts->tv_sec = secs;
+ ts->tv_nsec = ns;
+
return 0;
}
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
- if (likely(gtod->sysctl_enabled))
- switch (clock) {
- case CLOCK_REALTIME:
- if (likely(gtod->clock.vread))
- return do_realtime(ts);
- break;
- case CLOCK_MONOTONIC:
- if (likely(gtod->clock.vread))
- return do_monotonic(ts);
- break;
- case CLOCK_REALTIME_COARSE:
- return do_realtime_coarse(ts);
- case CLOCK_MONOTONIC_COARSE:
- return do_monotonic_coarse(ts);
- }
+ switch (clock) {
+ case CLOCK_REALTIME:
+ if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
+ return do_realtime(ts);
+ break;
+ case CLOCK_MONOTONIC:
+ if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
+ return do_monotonic(ts);
+ break;
+ case CLOCK_REALTIME_COARSE:
+ return do_realtime_coarse(ts);
+ case CLOCK_MONOTONIC_COARSE:
+ return do_monotonic_coarse(ts);
+ }
+
return vdso_fallback_gettime(clock, ts);
}
int clock_gettime(clockid_t, struct timespec *)
@@ -136,7 +179,7 @@ int clock_gettime(clockid_t, struct timespec *)
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
long ret;
- if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
+ if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) {
if (likely(tv != NULL)) {
BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
offsetof(struct timespec, tv_nsec) ||
@@ -157,3 +200,19 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
}
int gettimeofday(struct timeval *, struct timezone *)
__attribute__((weak, alias("__vdso_gettimeofday")));
+
+/*
+ * This will break when the xtime seconds get inaccurate, but that is
+ * unlikely
+ */
+notrace time_t __vdso_time(time_t *t)
+{
+ /* This is atomic on x86_64 so we don't need any locks. */
+ time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
+
+ if (t)
+ *t = result;
+ return result;
+}
+int time(time_t *t)
+ __attribute__((weak, alias("__vdso_time")));
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 1d3aa6b8718..1b979c12ba8 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -1,10 +1,21 @@
+#include <asm/page_types.h>
+#include <linux/linkage.h>
#include <linux/init.h>
-__INITDATA
+__PAGE_ALIGNED_DATA
.globl vdso_start, vdso_end
+ .align PAGE_SIZE
vdso_start:
.incbin "arch/x86/vdso/vdso.so"
vdso_end:
-__FINIT
+.previous
+
+ .globl vdso_pages
+ .bss
+ .align 8
+ .type vdso_pages, @object
+vdso_pages:
+ .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8
+ .size vdso_pages, .-vdso_pages
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 4e5dd3b4de7..b96b2677cad 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -23,15 +23,10 @@ VERSION {
__vdso_gettimeofday;
getcpu;
__vdso_getcpu;
+ time;
+ __vdso_time;
local: *;
};
}
VDSO64_PRELINK = VDSO_PRELINK;
-
-/*
- * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
- */
-#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x;
-#include "vextern.h"
-#undef VEXTERN
diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h
deleted file mode 100644
index 1683ba2ae3e..00000000000
--- a/arch/x86/vdso/vextern.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef VEXTERN
-#include <asm/vsyscall.h>
-#define VEXTERN(x) \
- extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
-#endif
-
-#define VMAGIC 0xfeedbabeabcdefabUL
-
-/* Any kernel variables used in the vDSO must be exported in the main
- kernel's vmlinux.lds.S/vsyscall.h/proper __section and
- put into vextern.h and be referenced as a pointer with vdso prefix.
- The main kernel later fills in the values. */
-
-VEXTERN(jiffies)
-VEXTERN(vgetcpu_mode)
-VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 9fbc6b20026..5463ad55857 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -11,14 +11,13 @@
#include <linux/time.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
-#include "vextern.h"
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int p;
- if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
+ if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
native_read_tscp(&p);
} else {
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 4b5d26f108b..316fbca3490 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -14,68 +14,61 @@
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
-
-#include "vextern.h" /* Just for VMAGIC. */
-#undef VEXTERN
+#include <asm/page.h>
unsigned int __read_mostly vdso_enabled = 1;
extern char vdso_start[], vdso_end[];
extern unsigned short vdso_sync_cpuid;
-static struct page **vdso_pages;
+extern struct page *vdso_pages[];
static unsigned vdso_size;
-static inline void *var_ref(void *p, char *name)
+static void __init patch_vdso(void *vdso, size_t len)
{
- if (*(void **)p != (void *)VMAGIC) {
- printk("VDSO: variable %s broken\n", name);
- vdso_enabled = 0;
+ Elf64_Ehdr *hdr = vdso;
+ Elf64_Shdr *sechdrs, *alt_sec = 0;
+ char *secstrings;
+ void *alt_data;
+ int i;
+
+ BUG_ON(len < sizeof(Elf64_Ehdr));
+ BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
+
+ sechdrs = (void *)hdr + hdr->e_shoff;
+ secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ Elf64_Shdr *shdr = &sechdrs[i];
+ if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
+ alt_sec = shdr;
+ goto found;
+ }
}
- return p;
+
+ /* If we get here, it's probably a bug. */
+ pr_warning("patch_vdso: .altinstructions not found\n");
+ return; /* nothing to patch */
+
+found:
+ alt_data = (void *)hdr + alt_sec->sh_offset;
+ apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
}
-static int __init init_vdso_vars(void)
+static int __init init_vdso(void)
{
int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
int i;
- char *vbase;
- vdso_size = npages << PAGE_SHIFT;
- vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
- if (!vdso_pages)
- goto oom;
- for (i = 0; i < npages; i++) {
- struct page *p;
- p = alloc_page(GFP_KERNEL);
- if (!p)
- goto oom;
- vdso_pages[i] = p;
- copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
- }
-
- vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
- if (!vbase)
- goto oom;
+ patch_vdso(vdso_start, vdso_end - vdso_start);
- if (memcmp(vbase, "\177ELF", 4)) {
- printk("VDSO: I'm broken; not ELF\n");
- vdso_enabled = 0;
- }
+ vdso_size = npages << PAGE_SHIFT;
+ for (i = 0; i < npages; i++)
+ vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
-#define VEXTERN(x) \
- *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
-#include "vextern.h"
-#undef VEXTERN
- vunmap(vbase);
return 0;
-
- oom:
- printk("Cannot allocate vdso\n");
- vdso_enabled = 0;
- return -ENOMEM;
}
-subsys_initcall(init_vdso_vars);
+subsys_initcall(init_vdso);
struct linux_binprm;
diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c
deleted file mode 100644
index 1b7e703684f..00000000000
--- a/arch/x86/vdso/vvar.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/* Define pointer to external vDSO variables.
- These are part of the vDSO. The kernel fills in the real addresses
- at boot time. This is done because when the vdso is linked the
- kernel isn't yet and we don't know the final addresses. */
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <asm/vsyscall.h>
-#include <asm/timex.h>
-#include <asm/vgtod.h>
-
-#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC;
-#include "vextern.h"
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 17c565de3d6..45e94aca5bc 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -15,8 +15,10 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
grant-table.o suspend.o platform-pci-unplug.o \
p2m.o
+obj-$(CONFIG_FUNCTION_TRACER) += trace.o
+
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-
+obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e3c6a06cf72..974a528458a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -235,7 +235,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
*dx &= maskedx;
}
-static __init void xen_init_cpuid_mask(void)
+static void __init xen_init_cpuid_mask(void)
{
unsigned int ax, bx, cx, dx;
unsigned int xsave_mask;
@@ -341,6 +341,8 @@ static void xen_set_ldt(const void *addr, unsigned entries)
struct mmuext_op *op;
struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+ trace_xen_cpu_set_ldt(addr, entries);
+
op = mcs.args;
op->cmd = MMUEXT_SET_LDT;
op->arg1.linear_addr = (unsigned long)addr;
@@ -400,7 +402,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
/*
* load_gdt for early boot, when the gdt is only mapped once
*/
-static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
{
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
@@ -496,6 +498,8 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
u64 entry = *(u64 *)ptr;
+ trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
+
preempt_disable();
xen_mc_flush();
@@ -565,6 +569,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
unsigned long p = (unsigned long)&dt[entrynum];
unsigned long start, end;
+ trace_xen_cpu_write_idt_entry(dt, entrynum, g);
+
preempt_disable();
start = __this_cpu_read(idt_desc.address);
@@ -619,6 +625,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
+ trace_xen_cpu_load_idt(desc);
+
spin_lock(&lock);
__get_cpu_var(idt_desc) = *desc;
@@ -637,6 +645,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
const void *desc, int type)
{
+ trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
preempt_disable();
switch (type) {
@@ -662,9 +672,11 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
* Version of write_gdt_entry for use at early boot-time needed to
* update an entry as simply as possible.
*/
-static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
const void *desc, int type)
{
+ trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
switch (type) {
case DESC_LDT:
case DESC_TSS:
@@ -684,7 +696,9 @@ static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
static void xen_load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
- struct multicall_space mcs = xen_mc_entry(0);
+ struct multicall_space mcs;
+
+ mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
@@ -933,18 +947,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
return ret;
}
-static const struct pv_info xen_info __initdata = {
+static const struct pv_info xen_info __initconst = {
.paravirt_enabled = 1,
.shared_kernel_pmd = 0,
.name = "Xen",
};
-static const struct pv_init_ops xen_init_ops __initdata = {
+static const struct pv_init_ops xen_init_ops __initconst = {
.patch = xen_patch,
};
-static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.cpuid = xen_cpuid,
.set_debugreg = xen_set_debugreg,
@@ -1004,7 +1018,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.end_context_switch = xen_end_context_switch,
};
-static const struct pv_apic_ops xen_apic_ops __initdata = {
+static const struct pv_apic_ops xen_apic_ops __initconst = {
#ifdef CONFIG_X86_LOCAL_APIC
.startup_ipi_hook = paravirt_nop,
#endif
@@ -1033,6 +1047,13 @@ static void xen_machine_halt(void)
xen_reboot(SHUTDOWN_poweroff);
}
+static void xen_machine_power_off(void)
+{
+ if (pm_power_off)
+ pm_power_off();
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
static void xen_crash_shutdown(struct pt_regs *regs)
{
xen_reboot(SHUTDOWN_crash);
@@ -1055,10 +1076,10 @@ int xen_panic_handler_init(void)
return 0;
}
-static const struct machine_ops __initdata xen_machine_ops = {
+static const struct machine_ops xen_machine_ops __initconst = {
.restart = xen_restart,
.halt = xen_machine_halt,
- .power_off = xen_machine_halt,
+ .power_off = xen_machine_power_off,
.shutdown = xen_machine_halt,
.crash_shutdown = xen_crash_shutdown,
.emergency_restart = xen_emergency_restart,
@@ -1241,6 +1262,14 @@ asmlinkage void __init xen_start_kernel(void)
if (pci_xen)
x86_init.pci.arch_init = pci_xen_init;
} else {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+
+ xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+
/* Make sure ACS will be enabled */
pci_request_acs();
}
@@ -1332,7 +1361,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
+static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
.notifier_call = xen_hvm_cpu_notify,
};
@@ -1381,7 +1410,7 @@ bool xen_hvm_need_lapic(void)
}
EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
-const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
+const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
.name = "Xen HVM",
.detect = xen_hvm_platform,
.init_platform = xen_hvm_guest_init,
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 6a6fe893964..8bbb465b6f0 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -113,7 +113,7 @@ static void xen_halt(void)
xen_safe_halt();
}
-static const struct pv_irq_ops xen_irq_ops __initdata = {
+static const struct pv_irq_ops xen_irq_ops __initconst = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index a991b57f91f..f987bde77c4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -48,6 +48,8 @@
#include <linux/memblock.h>
#include <linux/seq_file.h>
+#include <trace/events/xen.h>
+
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
@@ -59,6 +61,7 @@
#include <asm/page.h>
#include <asm/init.h>
#include <asm/pat.h>
+#include <asm/smp.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -75,67 +78,12 @@
#include "mmu.h"
#include "debugfs.h"
-#define MMU_UPDATE_HISTO 30
-
/*
* Protects atomic reservation decrease/increase against concurrent increases.
* Also protects non-atomic updates of current_pages and balloon lists.
*/
DEFINE_SPINLOCK(xen_reservation_lock);
-#ifdef CONFIG_XEN_DEBUG_FS
-
-static struct {
- u32 pgd_update;
- u32 pgd_update_pinned;
- u32 pgd_update_batched;
-
- u32 pud_update;
- u32 pud_update_pinned;
- u32 pud_update_batched;
-
- u32 pmd_update;
- u32 pmd_update_pinned;
- u32 pmd_update_batched;
-
- u32 pte_update;
- u32 pte_update_pinned;
- u32 pte_update_batched;
-
- u32 mmu_update;
- u32 mmu_update_extended;
- u32 mmu_update_histo[MMU_UPDATE_HISTO];
-
- u32 prot_commit;
- u32 prot_commit_batched;
-
- u32 set_pte_at;
- u32 set_pte_at_batched;
- u32 set_pte_at_pinned;
- u32 set_pte_at_current;
- u32 set_pte_at_kernel;
-} mmu_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- if (unlikely(zero_stats)) {
- memset(&mmu_stats, 0, sizeof(mmu_stats));
- zero_stats = 0;
- }
-}
-
-#define ADD_STATS(elem, val) \
- do { check_zero(); mmu_stats.elem += (val); } while(0)
-
-#else /* !CONFIG_XEN_DEBUG_FS */
-
-#define ADD_STATS(elem, val) do { (void)(val); } while(0)
-
-#endif /* CONFIG_XEN_DEBUG_FS */
-
-
/*
* Identity map, in addition to plain kernel map. This needs to be
* large enough to allocate page table pages to allocate the rest.
@@ -243,21 +191,18 @@ static bool xen_page_pinned(void *ptr)
return PagePinned(page);
}
-static bool xen_iomap_pte(pte_t pte)
-{
- return pte_flags(pte) & _PAGE_IOMAP;
-}
-
void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
struct multicall_space mcs;
struct mmu_update *u;
+ trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
+
mcs = xen_mc_entry(sizeof(*u));
u = mcs.args;
/* ptep might be kmapped when using 32-bit HIGHPTE */
- u->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ u->ptr = virt_to_machine(ptep).maddr;
u->val = pte_val_ma(pteval);
MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
@@ -266,11 +211,6 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
}
EXPORT_SYMBOL_GPL(xen_set_domain_pte);
-static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
-{
- xen_set_domain_pte(ptep, pteval, DOMID_IO);
-}
-
static void xen_extend_mmu_update(const struct mmu_update *update)
{
struct multicall_space mcs;
@@ -279,27 +219,35 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
if (mcs.mc != NULL) {
- ADD_STATS(mmu_update_extended, 1);
- ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
-
mcs.mc->args[1]++;
-
- if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
- ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
- else
- ADD_STATS(mmu_update_histo[0], 1);
} else {
- ADD_STATS(mmu_update, 1);
mcs = __xen_mc_entry(sizeof(*u));
MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
- ADD_STATS(mmu_update_histo[1], 1);
}
u = mcs.args;
*u = *update;
}
-void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
+static void xen_extend_mmuext_op(const struct mmuext_op *op)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *u;
+
+ mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
+
+ if (mcs.mc != NULL) {
+ mcs.mc->args[1]++;
+ } else {
+ mcs = __xen_mc_entry(sizeof(*u));
+ MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+ }
+
+ u = mcs.args;
+ *u = *op;
+}
+
+static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
struct mmu_update u;
@@ -312,16 +260,14 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
u.val = pmd_val_ma(val);
xen_extend_mmu_update(&u);
- ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
-void xen_set_pmd(pmd_t *ptr, pmd_t val)
+static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
- ADD_STATS(pmd_update, 1);
+ trace_xen_mmu_set_pmd(ptr, val);
/* If page is not pinned, we can just update the entry
directly */
@@ -330,8 +276,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
return;
}
- ADD_STATS(pmd_update_pinned, 1);
-
xen_set_pmd_hyper(ptr, val);
}
@@ -344,41 +288,48 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
+static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
- if (xen_iomap_pte(pteval)) {
- xen_set_iomap_pte(ptep, pteval);
- goto out;
- }
+ struct mmu_update u;
- ADD_STATS(set_pte_at, 1);
-// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
- ADD_STATS(set_pte_at_current, mm == current->mm);
- ADD_STATS(set_pte_at_kernel, mm == &init_mm);
+ if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
+ return false;
- if (mm == current->mm || mm == &init_mm) {
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- struct multicall_space mcs;
- mcs = xen_mc_entry(0);
+ xen_mc_batch();
- MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
- ADD_STATS(set_pte_at_batched, 1);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
- goto out;
- } else
- if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
- goto out;
- }
- xen_set_pte(ptep, pteval);
+ u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ u.val = pte_val_ma(pteval);
+ xen_extend_mmu_update(&u);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ return true;
+}
+
+static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+ if (!xen_batched_set_pte(ptep, pteval))
+ native_set_pte(ptep, pteval);
+}
-out: return;
+static void xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+ trace_xen_mmu_set_pte(ptep, pteval);
+ __xen_set_pte(ptep, pteval);
+}
+
+static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
+ __xen_set_pte(ptep, pteval);
}
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
+ trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
return *ptep;
}
@@ -387,15 +338,13 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
{
struct mmu_update u;
+ trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
xen_mc_batch();
- u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
+ u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
u.val = pte_val_ma(pte);
xen_extend_mmu_update(&u);
- ADD_STATS(prot_commit, 1);
- ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
@@ -463,7 +412,7 @@ static pteval_t iomap_pte(pteval_t val)
return val;
}
-pteval_t xen_pte_val(pte_t pte)
+static pteval_t xen_pte_val(pte_t pte)
{
pteval_t pteval = pte.pte;
@@ -480,7 +429,7 @@ pteval_t xen_pte_val(pte_t pte)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
-pgdval_t xen_pgd_val(pgd_t pgd)
+static pgdval_t xen_pgd_val(pgd_t pgd)
{
return pte_mfn_to_pfn(pgd.pgd);
}
@@ -511,7 +460,7 @@ void xen_set_pat(u64 pat)
WARN_ON(pat != 0x0007010600070106ull);
}
-pte_t xen_make_pte(pteval_t pte)
+static pte_t xen_make_pte(pteval_t pte)
{
phys_addr_t addr = (pte & PTE_PFN_MASK);
@@ -581,20 +530,20 @@ pte_t xen_make_pte_debug(pteval_t pte)
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
#endif
-pgd_t xen_make_pgd(pgdval_t pgd)
+static pgd_t xen_make_pgd(pgdval_t pgd)
{
pgd = pte_pfn_to_mfn(pgd);
return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
-pmdval_t xen_pmd_val(pmd_t pmd)
+static pmdval_t xen_pmd_val(pmd_t pmd)
{
return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
-void xen_set_pud_hyper(pud_t *ptr, pud_t val)
+static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
struct mmu_update u;
@@ -607,16 +556,14 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
u.val = pud_val_ma(val);
xen_extend_mmu_update(&u);
- ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
-void xen_set_pud(pud_t *ptr, pud_t val)
+static void xen_set_pud(pud_t *ptr, pud_t val)
{
- ADD_STATS(pud_update, 1);
+ trace_xen_mmu_set_pud(ptr, val);
/* If page is not pinned, we can just update the entry
directly */
@@ -625,56 +572,31 @@ void xen_set_pud(pud_t *ptr, pud_t val)
return;
}
- ADD_STATS(pud_update_pinned, 1);
-
xen_set_pud_hyper(ptr, val);
}
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
- if (xen_iomap_pte(pte)) {
- xen_set_iomap_pte(ptep, pte);
- return;
- }
-
- ADD_STATS(pte_update, 1);
-// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
- ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
-#ifdef CONFIG_X86_PAE
- ptep->pte_high = pte.pte_high;
- smp_wmb();
- ptep->pte_low = pte.pte_low;
-#else
- *ptep = pte;
-#endif
-}
-
#ifdef CONFIG_X86_PAE
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- if (xen_iomap_pte(pte)) {
- xen_set_iomap_pte(ptep, pte);
- return;
- }
-
+ trace_xen_mmu_set_pte_atomic(ptep, pte);
set_64bit((u64 *)ptep, native_pte_val(pte));
}
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
- ptep->pte_low = 0;
- smp_wmb(); /* make sure low gets written first */
- ptep->pte_high = 0;
+ trace_xen_mmu_pte_clear(mm, addr, ptep);
+ if (!xen_batched_set_pte(ptep, native_make_pte(0)))
+ native_pte_clear(mm, addr, ptep);
}
-void xen_pmd_clear(pmd_t *pmdp)
+static void xen_pmd_clear(pmd_t *pmdp)
{
+ trace_xen_mmu_pmd_clear(pmdp);
set_pmd(pmdp, __pmd(0));
}
#endif /* CONFIG_X86_PAE */
-pmd_t xen_make_pmd(pmdval_t pmd)
+static pmd_t xen_make_pmd(pmdval_t pmd)
{
pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
@@ -682,13 +604,13 @@ pmd_t xen_make_pmd(pmdval_t pmd)
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
#if PAGETABLE_LEVELS == 4
-pudval_t xen_pud_val(pud_t pud)
+static pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
-pud_t xen_make_pud(pudval_t pud)
+static pud_t xen_make_pud(pudval_t pud)
{
pud = pte_pfn_to_mfn(pud);
@@ -696,7 +618,7 @@ pud_t xen_make_pud(pudval_t pud)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
-pgd_t *xen_get_user_pgd(pgd_t *pgd)
+static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
unsigned offset = pgd - pgd_page;
@@ -728,7 +650,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
* 2. It is always pinned
* 3. It has no user pagetable attached to it
*/
-void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
preempt_disable();
@@ -741,11 +663,11 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
preempt_enable();
}
-void xen_set_pgd(pgd_t *ptr, pgd_t val)
+static void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
pgd_t *user_ptr = xen_get_user_pgd(ptr);
- ADD_STATS(pgd_update, 1);
+ trace_xen_mmu_set_pgd(ptr, user_ptr, val);
/* If page is not pinned, we can just update the entry
directly */
@@ -758,9 +680,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
return;
}
- ADD_STATS(pgd_update_pinned, 1);
- ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
-
/* If it's pinned, then we can at least batch the kernel and
user updates together. */
xen_mc_batch();
@@ -909,14 +828,12 @@ static void xen_pte_unlock(void *v)
static void xen_do_pin(unsigned level, unsigned long pfn)
{
- struct mmuext_op *op;
- struct multicall_space mcs;
+ struct mmuext_op op;
- mcs = __xen_mc_entry(sizeof(*op));
- op = mcs.args;
- op->cmd = level;
- op->arg1.mfn = pfn_to_mfn(pfn);
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+ op.cmd = level;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+
+ xen_extend_mmuext_op(&op);
}
static int xen_pin_page(struct mm_struct *mm, struct page *page,
@@ -984,6 +901,8 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
+ trace_xen_mmu_pgd_pin(mm, pgd);
+
xen_mc_batch();
if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -1054,7 +973,7 @@ void xen_mm_pin_all(void)
* that's before we have page structures to store the bits. So do all
* the book-keeping now.
*/
-static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
+static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
enum pt_level level)
{
SetPagePinned(page);
@@ -1109,6 +1028,8 @@ static int xen_unpin_page(struct mm_struct *mm, struct page *page,
/* Release a pagetables pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
+ trace_xen_mmu_pgd_unpin(mm, pgd);
+
xen_mc_batch();
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1162,14 +1083,14 @@ void xen_mm_unpin_all(void)
spin_unlock(&pgd_lock);
}
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
spin_lock(&next->page_table_lock);
xen_pgd_pin(next);
spin_unlock(&next->page_table_lock);
}
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
xen_pgd_pin(mm);
@@ -1187,7 +1108,7 @@ static void drop_other_mm_ref(void *info)
active_mm = percpu_read(cpu_tlbstate.active_mm);
- if (active_mm == mm)
+ if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
@@ -1256,7 +1177,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
* pagetable because of lazy tlb flushing. This means we need need to
* switch all CPUs off this pagetable before we can unpin it.
*/
-void xen_exit_mmap(struct mm_struct *mm)
+static void xen_exit_mmap(struct mm_struct *mm)
{
get_cpu(); /* make sure we don't move around */
xen_drop_mm_ref(mm);
@@ -1271,13 +1192,27 @@ void xen_exit_mmap(struct mm_struct *mm)
spin_unlock(&mm->page_table_lock);
}
-static __init void xen_pagetable_setup_start(pgd_t *base)
+static void __init xen_pagetable_setup_start(pgd_t *base)
{
}
+static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
+{
+ /* reserve the range used */
+ native_pagetable_reserve(start, end);
+
+ /* set as RW the rest */
+ printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
+ PFN_PHYS(pgt_buf_top));
+ while (end < PFN_PHYS(pgt_buf_top)) {
+ make_lowmem_page_readwrite(__va(end));
+ end += PAGE_SIZE;
+ }
+}
+
static void xen_post_allocator_init(void);
-static __init void xen_pagetable_setup_done(pgd_t *base)
+static void __init xen_pagetable_setup_done(pgd_t *base)
{
xen_setup_shared_info();
xen_post_allocator_init();
@@ -1303,6 +1238,8 @@ static void xen_flush_tlb(void)
struct mmuext_op *op;
struct multicall_space mcs;
+ trace_xen_mmu_flush_tlb(0);
+
preempt_disable();
mcs = xen_mc_entry(sizeof(*op));
@@ -1321,6 +1258,8 @@ static void xen_flush_tlb_single(unsigned long addr)
struct mmuext_op *op;
struct multicall_space mcs;
+ trace_xen_mmu_flush_tlb_single(addr);
+
preempt_disable();
mcs = xen_mc_entry(sizeof(*op));
@@ -1339,10 +1278,16 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
{
struct {
struct mmuext_op op;
+#ifdef CONFIG_SMP
+ DECLARE_BITMAP(mask, num_processors);
+#else
DECLARE_BITMAP(mask, NR_CPUS);
+#endif
} *args;
struct multicall_space mcs;
+ trace_xen_mmu_flush_tlb_others(cpus, mm, va);
+
if (cpumask_empty(cpus))
return; /* nothing to do */
@@ -1378,10 +1323,11 @@ static void set_current_cr3(void *v)
static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
- struct mmuext_op *op;
- struct multicall_space mcs;
+ struct mmuext_op op;
unsigned long mfn;
+ trace_xen_mmu_write_cr3(kernel, cr3);
+
if (cr3)
mfn = pfn_to_mfn(PFN_DOWN(cr3));
else
@@ -1389,13 +1335,10 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
WARN_ON(mfn == 0 && kernel);
- mcs = __xen_mc_entry(sizeof(*op));
+ op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+ op.arg1.mfn = mfn;
- op = mcs.args;
- op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
- op->arg1.mfn = mfn;
-
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+ xen_extend_mmuext_op(&op);
if (kernel) {
percpu_write(xen_cr3, cr3);
@@ -1473,16 +1416,20 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
#endif
}
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
-{
- unsigned long pfn = pte_pfn(pte);
-
#ifdef CONFIG_X86_32
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
+{
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
if (pte_val_ma(*ptep) & _PAGE_PRESENT)
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
pte_val_ma(pte));
-#endif
+
+ return pte;
+}
+#else /* CONFIG_X86_64 */
+static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
/*
* If the new pfn is within the range of the newly allocated
@@ -1491,16 +1438,17 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
* it is RO.
*/
if (((!is_early_ioremap_ptep(ptep) &&
- pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
+ pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
pte = pte_wrprotect(pte);
return pte;
}
+#endif /* CONFIG_X86_64 */
/* Init-time set_pte while constructing initial pagetables, which
doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
pte = mask_rw_pte(ptep, pte);
@@ -1518,7 +1466,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
@@ -1528,7 +1476,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
}
/* Used for pmd and pud */
-static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
@@ -1538,30 +1486,63 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
/* Early release_pte assumes that all pts are pinned, since there's
only init_mm and anything attached to that is pinned. */
-static __init void xen_release_pte_init(unsigned long pfn)
+static void __init xen_release_pte_init(unsigned long pfn)
{
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
-static __init void xen_release_pmd_init(unsigned long pfn)
+static void __init xen_release_pmd_init(unsigned long pfn)
{
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
+static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *op;
+
+ mcs = __xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = cmd;
+ op->arg1.mfn = pfn_to_mfn(pfn);
+
+ MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+}
+
+static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
+{
+ struct multicall_space mcs;
+ unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
+
+ mcs = __xen_mc_entry(0);
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
+ pfn_pte(pfn, prot), 0);
+}
+
/* This needs to make sure the new pte page is pinned iff its being
attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
+ unsigned level)
{
- struct page *page = pfn_to_page(pfn);
+ bool pinned = PagePinned(virt_to_page(mm->pgd));
+
+ trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
+
+ if (pinned) {
+ struct page *page = pfn_to_page(pfn);
- if (PagePinned(virt_to_page(mm->pgd))) {
SetPagePinned(page);
if (!PageHighMem(page)) {
- make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+ xen_mc_batch();
+
+ __set_pfn_prot(pfn, PAGE_KERNEL_RO);
+
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+ __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
} else {
/* make sure there are no stray mappings of
this page */
@@ -1581,15 +1562,23 @@ static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
}
/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
+static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
{
struct page *page = pfn_to_page(pfn);
+ bool pinned = PagePinned(page);
+
+ trace_xen_mmu_release_ptpage(pfn, level, pinned);
- if (PagePinned(page)) {
+ if (pinned) {
if (!PageHighMem(page)) {
+ xen_mc_batch();
+
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+
+ __set_pfn_prot(pfn, PAGE_KERNEL);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
}
ClearPagePinned(page);
}
@@ -1670,7 +1659,7 @@ static void set_page_prot(void *addr, pgprot_t prot)
BUG();
}
-static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
unsigned pmdidx, pteidx;
unsigned ident_pte;
@@ -1702,6 +1691,11 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
pte_t pte;
+#ifdef CONFIG_X86_32
+ if (pfn > max_pfn_mapped)
+ max_pfn_mapped = pfn;
+#endif
+
if (!pte_none(pte_page[pteidx]))
continue;
@@ -1753,7 +1747,7 @@ static void convert_pfn_mfn(void *v)
* of the physical mapping once some sort of allocator has been set
* up.
*/
-__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pud_t *l3;
@@ -1824,7 +1818,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
-static __init void xen_write_cr3_init(unsigned long cr3)
+static void __init xen_write_cr3_init(unsigned long cr3)
{
unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
@@ -1861,7 +1855,7 @@ static __init void xen_write_cr3_init(unsigned long cr3)
pv_mmu_ops.write_cr3 = &xen_write_cr3;
}
-__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
@@ -1869,7 +1863,9 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
initial_kernel_pmd =
extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
- max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+ xen_start_info->nr_pt_frames * PAGE_SIZE +
+ 512*1024);
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1967,7 +1963,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
-__init void xen_ident_map_ISA(void)
+void __init xen_ident_map_ISA(void)
{
unsigned long pa;
@@ -1990,7 +1986,7 @@ __init void xen_ident_map_ISA(void)
xen_flush_tlb();
}
-static __init void xen_post_allocator_init(void)
+static void __init xen_post_allocator_init(void)
{
#ifdef CONFIG_XEN_DEBUG
pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
@@ -2027,7 +2023,7 @@ static void xen_leave_lazy_mmu(void)
preempt_enable();
}
-static const struct pv_mmu_ops xen_mmu_ops __initdata = {
+static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.read_cr2 = xen_read_cr2,
.write_cr2 = xen_write_cr2,
@@ -2100,6 +2096,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
void __init xen_init_mmu_ops(void)
{
+ x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
pv_mmu_ops = xen_mmu_ops;
@@ -2351,7 +2348,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
struct remap_data *rmd = data;
pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
- rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
+ rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
rmd->mmu_update->val = pte_val_ma(pte);
rmd->mmu_update++;
@@ -2405,7 +2402,6 @@ out:
EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
#ifdef CONFIG_XEN_DEBUG_FS
-
static int p2m_dump_open(struct inode *inode, struct file *filp)
{
return single_open(filp, p2m_dump_show, NULL);
@@ -2417,65 +2413,4 @@ static const struct file_operations p2m_dump_fops = {
.llseek = seq_lseek,
.release = single_release,
};
-
-static struct dentry *d_mmu_debug;
-
-static int __init xen_mmu_debugfs(void)
-{
- struct dentry *d_xen = xen_init_debugfs();
-
- if (d_xen == NULL)
- return -ENOMEM;
-
- d_mmu_debug = debugfs_create_dir("mmu", d_xen);
-
- debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
-
- debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
- debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
- &mmu_stats.pgd_update_pinned);
- debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pgd_update_pinned);
-
- debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
- debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
- &mmu_stats.pud_update_pinned);
- debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pud_update_pinned);
-
- debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
- debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
- &mmu_stats.pmd_update_pinned);
- debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pmd_update_pinned);
-
- debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
-// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
-// &mmu_stats.pte_update_pinned);
- debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
- &mmu_stats.pte_update_pinned);
-
- debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
- debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
- &mmu_stats.mmu_update_extended);
- xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
- mmu_stats.mmu_update_histo, 20);
-
- debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
- debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
- &mmu_stats.set_pte_at_batched);
- debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
- &mmu_stats.set_pte_at_current);
- debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
- &mmu_stats.set_pte_at_kernel);
-
- debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
- debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
- &mmu_stats.prot_commit_batched);
-
- debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
- return 0;
-}
-fs_initcall(xen_mmu_debugfs);
-
-#endif /* CONFIG_XEN_DEBUG_FS */
+#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 537bb9aab77..73809bb951b 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -15,43 +15,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
-void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
-void xen_exit_mmap(struct mm_struct *mm);
-
-pteval_t xen_pte_val(pte_t);
-pmdval_t xen_pmd_val(pmd_t);
-pgdval_t xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(pteval_t);
-pmd_t xen_make_pmd(pmdval_t);
-pgd_t xen_make_pgd(pgdval_t);
-
-void xen_set_pte(pte_t *ptep, pte_t pteval);
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval);
-
-#ifdef CONFIG_X86_PAE
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
-#endif /* CONFIG_X86_PAE */
-
-void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
-void xen_set_pud(pud_t *ptr, pud_t val);
-void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
-void xen_set_pud_hyper(pud_t *ptr, pud_t val);
-
-#if PAGETABLE_LEVELS == 4
-pudval_t xen_pud_val(pud_t pud);
-pud_t xen_make_pud(pudval_t pudval);
-void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
-void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
-#endif
-
-pgd_t *xen_get_user_pgd(pgd_t *pgd);
-
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 8bff7e7c290..0d82003e76a 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -30,12 +30,13 @@
#define MC_BATCH 32
-#define MC_DEBUG 1
+#define MC_DEBUG 0
#define MC_ARGS (MC_BATCH * 16)
struct mc_buffer {
+ unsigned mcidx, argidx, cbidx;
struct multicall_entry entries[MC_BATCH];
#if MC_DEBUG
struct multicall_entry debug[MC_BATCH];
@@ -46,85 +47,15 @@ struct mc_buffer {
void (*fn)(void *);
void *data;
} callbacks[MC_BATCH];
- unsigned mcidx, argidx, cbidx;
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
-/* flush reasons 0- slots, 1- args, 2- callbacks */
-enum flush_reasons
-{
- FL_SLOTS,
- FL_ARGS,
- FL_CALLBACKS,
-
- FL_N_REASONS
-};
-
-#ifdef CONFIG_XEN_DEBUG_FS
-#define NHYPERCALLS 40 /* not really */
-
-static struct {
- unsigned histo[MC_BATCH+1];
-
- unsigned issued;
- unsigned arg_total;
- unsigned hypercalls;
- unsigned histo_hypercalls[NHYPERCALLS];
-
- unsigned flush[FL_N_REASONS];
-} mc_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- if (unlikely(zero_stats)) {
- memset(&mc_stats, 0, sizeof(mc_stats));
- zero_stats = 0;
- }
-}
-
-static void mc_add_stats(const struct mc_buffer *mc)
-{
- int i;
-
- check_zero();
-
- mc_stats.issued++;
- mc_stats.hypercalls += mc->mcidx;
- mc_stats.arg_total += mc->argidx;
-
- mc_stats.histo[mc->mcidx]++;
- for(i = 0; i < mc->mcidx; i++) {
- unsigned op = mc->entries[i].op;
- if (op < NHYPERCALLS)
- mc_stats.histo_hypercalls[op]++;
- }
-}
-
-static void mc_stats_flush(enum flush_reasons idx)
-{
- check_zero();
-
- mc_stats.flush[idx]++;
-}
-
-#else /* !CONFIG_XEN_DEBUG_FS */
-
-static inline void mc_add_stats(const struct mc_buffer *mc)
-{
-}
-
-static inline void mc_stats_flush(enum flush_reasons idx)
-{
-}
-#endif /* CONFIG_XEN_DEBUG_FS */
-
void xen_mc_flush(void)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct multicall_entry *mc;
int ret = 0;
unsigned long flags;
int i;
@@ -135,9 +66,26 @@ void xen_mc_flush(void)
something in the middle */
local_irq_save(flags);
- mc_add_stats(b);
+ trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
+
+ switch (b->mcidx) {
+ case 0:
+ /* no-op */
+ BUG_ON(b->argidx != 0);
+ break;
+
+ case 1:
+ /* Singleton multicall - bypass multicall machinery
+ and just do the call directly. */
+ mc = &b->entries[0];
+
+ mc->result = privcmd_call(mc->op,
+ mc->args[0], mc->args[1], mc->args[2],
+ mc->args[3], mc->args[4]);
+ ret = mc->result < 0;
+ break;
- if (b->mcidx) {
+ default:
#if MC_DEBUG
memcpy(b->debug, b->entries,
b->mcidx * sizeof(struct multicall_entry));
@@ -164,11 +112,10 @@ void xen_mc_flush(void)
}
}
#endif
+ }
- b->mcidx = 0;
- b->argidx = 0;
- } else
- BUG_ON(b->argidx != 0);
+ b->mcidx = 0;
+ b->argidx = 0;
for (i = 0; i < b->cbidx; i++) {
struct callback *cb = &b->callbacks[i];
@@ -188,25 +135,28 @@ struct multicall_space __xen_mc_entry(size_t args)
struct multicall_space ret;
unsigned argidx = roundup(b->argidx, sizeof(u64));
+ trace_xen_mc_entry_alloc(args);
+
BUG_ON(preemptible());
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
- if (b->mcidx == MC_BATCH ||
- (argidx + args) > MC_ARGS) {
- mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
+ if (unlikely(b->mcidx == MC_BATCH ||
+ (argidx + args) >= MC_ARGS)) {
+ trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ?
+ XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);
xen_mc_flush();
argidx = roundup(b->argidx, sizeof(u64));
}
ret.mc = &b->entries[b->mcidx];
-#ifdef MC_DEBUG
+#if MC_DEBUG
b->caller[b->mcidx] = __builtin_return_address(0);
#endif
b->mcidx++;
ret.args = &b->args[argidx];
b->argidx = argidx + args;
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
return ret;
}
@@ -216,22 +166,27 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
struct multicall_space ret = { NULL, NULL };
BUG_ON(preemptible());
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
- if (b->mcidx == 0)
- return ret;
-
- if (b->entries[b->mcidx - 1].op != op)
- return ret;
+ if (unlikely(b->mcidx == 0 ||
+ b->entries[b->mcidx - 1].op != op)) {
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP);
+ goto out;
+ }
- if ((b->argidx + size) > MC_ARGS)
- return ret;
+ if (unlikely((b->argidx + size) >= MC_ARGS)) {
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE);
+ goto out;
+ }
ret.mc = &b->entries[b->mcidx - 1];
ret.args = &b->args[b->argidx];
b->argidx += size;
- BUG_ON(b->argidx > MC_ARGS);
+ BUG_ON(b->argidx >= MC_ARGS);
+
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK);
+out:
return ret;
}
@@ -241,43 +196,13 @@ void xen_mc_callback(void (*fn)(void *), void *data)
struct callback *cb;
if (b->cbidx == MC_BATCH) {
- mc_stats_flush(FL_CALLBACKS);
+ trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);
xen_mc_flush();
}
+ trace_xen_mc_callback(fn, data);
+
cb = &b->callbacks[b->cbidx++];
cb->fn = fn;
cb->data = data;
}
-
-#ifdef CONFIG_XEN_DEBUG_FS
-
-static struct dentry *d_mc_debug;
-
-static int __init xen_mc_debugfs(void)
-{
- struct dentry *d_xen = xen_init_debugfs();
-
- if (d_xen == NULL)
- return -ENOMEM;
-
- d_mc_debug = debugfs_create_dir("multicalls", d_xen);
-
- debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
-
- debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
- debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
- debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
-
- xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
- mc_stats.histo, MC_BATCH);
- xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
- mc_stats.histo_hypercalls, NHYPERCALLS);
- xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
- mc_stats.flush, FL_N_REASONS);
-
- return 0;
-}
-fs_initcall(xen_mc_debugfs);
-
-#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 4ec8035e321..dee79b78a90 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -1,6 +1,8 @@
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H
+#include <trace/events/xen.h>
+
#include "xen-ops.h"
/* Multicalls */
@@ -20,8 +22,10 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
static inline void xen_mc_batch(void)
{
unsigned long flags;
+
/* need to disable interrupts until this entry is complete */
local_irq_save(flags);
+ trace_xen_mc_batch(paravirt_get_lazy_mode());
__this_cpu_write(xen_mc_irq_flags, flags);
}
@@ -37,6 +41,8 @@ void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
+ trace_xen_mc_issue(mode);
+
if ((paravirt_get_lazy_mode() & mode) == 0)
xen_mc_flush();
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 141eb0de8b0..58efeb9d544 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -522,11 +522,20 @@ static bool __init __early_alloc_p2m(unsigned long pfn)
/* Boundary cross-over for the edges: */
if (idx) {
unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ unsigned long *mid_mfn_p;
p2m_init(p2m);
p2m_top[topidx][mididx] = p2m;
+ /* For save/restore we need to MFN of the P2M saved */
+
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
+ "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
+ topidx, mididx);
+ mid_mfn_p[mididx] = virt_to_mfn(p2m);
+
}
return idx != 0;
}
@@ -549,12 +558,29 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
{
unsigned topidx = p2m_top_index(pfn);
- if (p2m_top[topidx] == p2m_mid_missing) {
- unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ unsigned long *mid_mfn_p;
+ unsigned long **mid;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ if (mid == p2m_mid_missing) {
+ mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_init(mid);
p2m_top[topidx] = mid;
+
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ }
+ /* And the save/restore P2M tables.. */
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ /* Note: we don't set mid_mfn_p[midix] here,
+ * look in __early_alloc_p2m */
}
}
@@ -650,7 +676,7 @@ static unsigned long mfn_hash(unsigned long mfn)
}
/* Add an MFN override for a particular page */
-int m2p_add_override(unsigned long mfn, struct page *page)
+int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
{
unsigned long flags;
unsigned long pfn;
@@ -662,7 +688,6 @@ int m2p_add_override(unsigned long mfn, struct page *page)
if (!PageHighMem(page)) {
address = (unsigned long)__va(pfn << PAGE_SHIFT);
ptep = lookup_address(address, &level);
-
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
"m2p_add_override: pfn %lx not mapped", pfn))
return -EINVAL;
@@ -674,18 +699,17 @@ int m2p_add_override(unsigned long mfn, struct page *page)
if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
return -ENOMEM;
- if (!PageHighMem(page))
+ if (clear_pte && !PageHighMem(page))
/* Just zap old mapping for now */
pte_clear(&init_mm, address, ptep);
-
spin_lock_irqsave(&m2p_override_lock, flags);
list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
spin_unlock_irqrestore(&m2p_override_lock, flags);
return 0;
}
-
-int m2p_remove_override(struct page *page)
+EXPORT_SYMBOL_GPL(m2p_add_override);
+int m2p_remove_override(struct page *page, bool clear_pte)
{
unsigned long flags;
unsigned long mfn;
@@ -713,7 +737,7 @@ int m2p_remove_override(struct page *page)
spin_unlock_irqrestore(&m2p_override_lock, flags);
set_phys_to_machine(pfn, page->index);
- if (!PageHighMem(page))
+ if (clear_pte && !PageHighMem(page))
set_pte_at(&init_mm, address, ptep,
pfn_pte(pfn, PAGE_KERNEL));
/* No tlb flush necessary because the caller already
@@ -721,6 +745,7 @@ int m2p_remove_override(struct page *page)
return 0;
}
+EXPORT_SYMBOL_GPL(m2p_remove_override);
struct page *m2p_find_override(unsigned long mfn)
{
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index bfd0632fe65..b480d4207a4 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -36,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void)
/* If running as PV guest, either iommu=soft, or swiotlb=force will
* activate this IOMMU. If running as PV privileged, activate it
- * irregardlesss.
+ * irregardless.
*/
if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
(xen_pv_domain()))
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 25c52f94a27..ffcf2615640 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
#ifdef CONFIG_XEN_PVHVM
static int xen_emul_unplug;
-static int __init check_platform_magic(void)
+static int check_platform_magic(void)
{
short magic;
char protocol;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index fa0269a9937..60aeeb56948 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -50,7 +50,7 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
*/
#define EXTRA_MEM_RATIO (10)
-static __init void xen_add_extra_mem(unsigned long pages)
+static void __init xen_add_extra_mem(unsigned long pages)
{
unsigned long pfn;
@@ -166,7 +166,7 @@ static unsigned long __init xen_set_identity(const struct e820entry *list,
if (last > end)
continue;
- if (entry->type == E820_RAM) {
+ if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
if (start > start_pci)
identity += set_phys_range_identity(
PFN_UP(start_pci), PFN_DOWN(start));
@@ -262,6 +262,12 @@ char * __init xen_memory_setup(void)
if (map[i].size > 0)
e820_add_region(map[i].addr, map[i].size, map[i].type);
}
+ /* Align the balloon area so that max_low_pfn does not get set
+ * to be at the _end_ of the PCI gap at the far end (fee01000).
+ * Note that xen_extra_mem_start gets set in the loop above to be
+ * past the last E820 region. */
+ if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
+ xen_extra_mem_start = (1ULL<<32);
/*
* In domU, the ISA region is normal, usable memory, but we
@@ -336,7 +342,7 @@ static void __init fiddle_vdso(void)
#endif
}
-static __cpuinit int register_callback(unsigned type, const void *func)
+static int __cpuinit register_callback(unsigned type, const void *func)
{
struct callback_register callback = {
.type = type,
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 30612441ed9..b4533a86d7e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -46,18 +46,17 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
return IRQ_HANDLED;
}
-static __cpuinit void cpu_bringup(void)
+static void __cpuinit cpu_bringup(void)
{
int cpu = smp_processor_id();
@@ -85,7 +84,7 @@ static __cpuinit void cpu_bringup(void)
wmb(); /* make sure everything is out */
}
-static __cpuinit void cpu_bringup_and_idle(void)
+static void __cpuinit cpu_bringup_and_idle(void)
{
cpu_bringup();
cpu_idle();
@@ -206,11 +205,18 @@ static void __init xen_smp_prepare_boot_cpu(void)
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
+ unsigned int i;
xen_init_lock_cpu(0);
smp_store_cpu_info(0);
cpu_data(0).x86_max_cores = 1;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ }
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
@@ -242,7 +248,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
}
}
-static __cpuinit int
+static int __cpuinit
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
@@ -486,7 +492,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
-static const struct smp_ops xen_smp_ops __initdata = {
+static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_smp_prepare_cpus,
.smp_cpus_done = xen_smp_cpus_done,
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 2e2d370a47b..5158c505bef 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -26,8 +26,6 @@
#include "xen-ops.h"
-#define XEN_SHIFT 22
-
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
@@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = {
.rating = 400,
.read = xen_clocksource_get_cycles,
.mask = ~0,
- .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
- .shift = XEN_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -439,16 +435,16 @@ void xen_timer_resume(void)
}
}
-static const struct pv_time_ops xen_time_ops __initdata = {
+static const struct pv_time_ops xen_time_ops __initconst = {
.sched_clock = xen_clocksource_read,
};
-static __init void xen_time_init(void)
+static void __init xen_time_init(void)
{
int cpu = smp_processor_id();
struct timespec tp;
- clocksource_register(&xen_clocksource);
+ clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
/* Successfully turned off 100Hz tick, so we have the
@@ -468,7 +464,7 @@ static __init void xen_time_init(void)
xen_setup_cpu_clockevents();
}
-__init void xen_init_time_ops(void)
+void __init xen_init_time_ops(void)
{
pv_time_ops = xen_time_ops;
@@ -490,7 +486,7 @@ static void xen_hvm_setup_cpu_clockevents(void)
xen_setup_cpu_clockevents();
}
-__init void xen_hvm_init_time_ops(void)
+void __init xen_hvm_init_time_ops(void)
{
/* vector callback is needed otherwise we cannot receive interrupts
* on cpu > 0 and at this point we don't know how many cpus are
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
new file mode 100644
index 00000000000..734beba2a08
--- /dev/null
+++ b/arch/x86/xen/trace.c
@@ -0,0 +1,61 @@
+#include <linux/ftrace.h>
+
+#define N(x) [__HYPERVISOR_##x] = "("#x")"
+static const char *xen_hypercall_names[] = {
+ N(set_trap_table),
+ N(mmu_update),
+ N(set_gdt),
+ N(stack_switch),
+ N(set_callbacks),
+ N(fpu_taskswitch),
+ N(sched_op_compat),
+ N(dom0_op),
+ N(set_debugreg),
+ N(get_debugreg),
+ N(update_descriptor),
+ N(memory_op),
+ N(multicall),
+ N(update_va_mapping),
+ N(set_timer_op),
+ N(event_channel_op_compat),
+ N(xen_version),
+ N(console_io),
+ N(physdev_op_compat),
+ N(grant_table_op),
+ N(vm_assist),
+ N(update_va_mapping_otherdomain),
+ N(iret),
+ N(vcpu_op),
+ N(set_segment_base),
+ N(mmuext_op),
+ N(acm_op),
+ N(nmi_op),
+ N(sched_op),
+ N(callback_op),
+ N(xenoprof_op),
+ N(event_channel_op),
+ N(physdev_op),
+ N(hvm_op),
+
+/* Architecture-specific hypercall definitions. */
+ N(arch_0),
+ N(arch_1),
+ N(arch_2),
+ N(arch_3),
+ N(arch_4),
+ N(arch_5),
+ N(arch_6),
+ N(arch_7),
+};
+#undef N
+
+static const char *xen_hypercall_name(unsigned op)
+{
+ if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL)
+ return xen_hypercall_names[op];
+
+ return "";
+}
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/xen.h>
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 00000000000..1cd7f4d11e2
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,67 @@
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{
+ struct screen_info *screen_info = &boot_params.screen_info;
+
+ /* This is drawn from a dump from vgacon:startup in
+ * standard Linux. */
+ screen_info->orig_video_mode = 3;
+ screen_info->orig_video_isVGA = 1;
+ screen_info->orig_video_lines = 25;
+ screen_info->orig_video_cols = 80;
+ screen_info->orig_video_ega_bx = 3;
+ screen_info->orig_video_points = 16;
+ screen_info->orig_y = screen_info->orig_video_lines - 1;
+
+ switch (info->video_type) {
+ case XEN_VGATYPE_TEXT_MODE_3:
+ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ + sizeof(info->u.text_mode_3))
+ break;
+ screen_info->orig_video_lines = info->u.text_mode_3.rows;
+ screen_info->orig_video_cols = info->u.text_mode_3.columns;
+ screen_info->orig_x = info->u.text_mode_3.cursor_x;
+ screen_info->orig_y = info->u.text_mode_3.cursor_y;
+ screen_info->orig_video_points =
+ info->u.text_mode_3.font_height;
+ break;
+
+ case XEN_VGATYPE_VESA_LFB:
+ if (size < offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps))
+ break;
+ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
+ screen_info->lfb_width = info->u.vesa_lfb.width;
+ screen_info->lfb_height = info->u.vesa_lfb.height;
+ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+ screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+ screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+ screen_info->red_size = info->u.vesa_lfb.red_size;
+ screen_info->red_pos = info->u.vesa_lfb.red_pos;
+ screen_info->green_size = info->u.vesa_lfb.green_size;
+ screen_info->green_pos = info->u.vesa_lfb.green_pos;
+ screen_info->blue_size = info->u.vesa_lfb.blue_size;
+ screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps)
+ + sizeof(info->u.vesa_lfb.gbl_caps))
+ screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.mode_attrs)
+ + sizeof(info->u.vesa_lfb.mode_attrs))
+ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+ break;
+ }
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 3112f55638c..b095739ccd4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -74,7 +74,7 @@ static inline void xen_hvm_smp_init(void) {}
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init xen_init_spinlocks(void);
-__cpuinit void xen_init_lock_cpu(int cpu);
+void __cpuinit xen_init_lock_cpu(int cpu);
void xen_uninit_lock_cpu(int cpu);
#else
static inline void xen_init_spinlocks(void)
@@ -88,6 +88,17 @@ static inline void xen_uninit_lock_cpu(int cpu)
}
#endif
+struct dom0_vga_console_info;
+
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#else
+static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
+ size_t size)
+{
+}
+#endif
+
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \