diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-12-29 09:45:15 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-12-29 09:45:15 +0100 |
commit | e1df957670aef74ffd9a4ad93e6d2c90bf6b4845 (patch) | |
tree | bca1fcfef55b3e3e82c9a822b4ac6428fce2b419 /arch/x86/kernel | |
parent | 2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 (diff) | |
parent | 3c92ec8ae91ecf59d88c798301833d7cf83f2179 (diff) |
Merge branch 'linus' into perfcounters/core
Conflicts:
fs/exec.c
include/linux/init_task.h
Simple context conflicts.
Diffstat (limited to 'arch/x86/kernel')
80 files changed, 3227 insertions, 2691 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3d4346a73a8..88dd768eab6 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,7 +26,7 @@ CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o +obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o @@ -66,6 +66,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o @@ -106,6 +107,8 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4c51a2f8fd3..65d0b72777e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1360,6 +1360,17 @@ static void __init acpi_process_madt(void) disable_acpi(); } } + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " + "information\n"); + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) " + "configuration information\n"); #endif return; } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a7b6dec6fc3..2e2da717b35 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -24,6 +24,7 @@ #include <linux/iommu-helper.h> #include <asm/proto.h> #include <asm/iommu.h> +#include <asm/gart.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> @@ -235,8 +236,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) - printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); + if (unlikely(i == EXIT_LOOP_COUNT)) + panic("AMD IOMMU: Completion wait loop failed\n"); + out: spin_unlock_irqrestore(&iommu->lock, flags); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 30ae2701b3d..c625800c55c 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -28,6 +28,7 @@ #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> #include <asm/iommu.h> +#include <asm/gart.h> /* * definitions for the ACPI scanning code @@ -427,6 +428,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, &entry, sizeof(entry)); + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); return cmd_buf; @@ -1074,7 +1079,8 @@ int __init amd_iommu_init(void) goto free; /* IOMMU rlookup table - find the IOMMU for a specific device */ - amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_rlookup_table = (void *)__get_free_pages( + GFP_KERNEL | __GFP_ZERO, get_order(rlookup_table_size)); if (amd_iommu_rlookup_table == NULL) goto free; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9a32b37ee2e..676debfc170 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -1,8 +1,9 @@ /* * Firmware replacement code. * - * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. + * Work around broken BIOSes that don't set an aperture, only set the + * aperture in the AGP bridge, or set too small aperture. + * * If all fails map the aperture over some low memory. This is cheaper than * doing bounce buffering. The memory is lost. This is done at early boot * because only the bootmem allocator can allocate 32+MB. diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 4f859acb156..6c83ac10e6d 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -30,6 +30,7 @@ #include <linux/module.h> #include <linux/dmi.h> #include <linux/dmar.h> +#include <linux/ftrace.h> #include <asm/perf_counter.h> #include <asm/atomic.h> @@ -442,6 +443,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); + apic_write(APIC_TMICT, 0xffffffff); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -560,13 +562,13 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta) } else { res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " + pr_warning("APIC calibration not consistent " "with PM Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + pr_info("APIC delta adjusted to PM-Timer: " "%lu (%ld)\n", (unsigned long)res, *delta); *delta = (long)res; } @@ -646,8 +648,7 @@ static int __init calibrate_APIC_clock(void) */ if (calibration_result < (1000000 / HZ)) { local_irq_enable(); - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); + pr_warning("APIC frequency too slow, disabling apic timer\n"); return -1; } @@ -673,13 +674,9 @@ static int __init calibrate_APIC_clock(void) while (lapic_cal_loops <= LAPIC_CAL_LOOPS) cpu_relax(); - local_irq_disable(); - /* Stop the lapic timer */ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); - local_irq_enable(); - /* Jiffies delta */ deltaj = lapic_cal_j2 - lapic_cal_j1; apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); @@ -693,8 +690,7 @@ static int __init calibrate_APIC_clock(void) local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - printk(KERN_WARNING - "APIC timer disabled due to verification failure.\n"); + pr_warning("APIC timer disabled due to verification failure.\n"); return -1; } @@ -715,7 +711,7 @@ void __init setup_boot_APIC_clock(void) * broadcast mechanism is used. On UP systems simply ignore it. */ if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); + pr_info("Disabling APIC timer\n"); /* No broadcast on UP ! */ if (num_possible_cpus() > 1) { lapic_clockevent.mult = 1; @@ -742,7 +738,7 @@ void __init setup_boot_APIC_clock(void) if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; else - printk(KERN_WARNING "APIC timer registered as dummy," + pr_warning("APIC timer registered as dummy," " due to nmi_watchdog=%d!\n", nmi_watchdog); /* Setup the lapic or request the broadcast */ @@ -774,8 +770,7 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); /* Switch it off */ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); return; @@ -797,7 +792,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs *regs) +void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -811,9 +806,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@ -1090,7 +1083,7 @@ static void __cpuinit lapic_setup_esr(void) unsigned int oldvalue, value, maxlvt; if (!lapic_is_integrated()) { - printk(KERN_INFO "No ESR for 82489DX.\n"); + pr_info("No ESR for 82489DX.\n"); return; } @@ -1101,7 +1094,7 @@ static void __cpuinit lapic_setup_esr(void) * ESR disabled - we can't do anything useful with the * errors anyway - mbligh */ - printk(KERN_INFO "Leaving ESR disabled.\n"); + pr_info("Leaving ESR disabled.\n"); return; } @@ -1296,7 +1289,7 @@ void check_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (msr & X2APIC_ENABLE) { - printk("x2apic enabled by BIOS, switching to x2apic ops\n"); + pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); x2apic_preenabled = x2apic = 1; apic_ops = &x2apic_ops; } @@ -1308,7 +1301,7 @@ void enable_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { - printk("Enabling x2apic\n"); + pr_info("Enabling x2apic\n"); wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } @@ -1323,9 +1316,8 @@ void __init enable_IR_x2apic(void) return; if (!x2apic_preenabled && disable_x2apic) { - printk(KERN_INFO - "Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); + pr_info("Skipped enabling x2apic and Interrupt-remapping " + "because of nox2apic\n"); return; } @@ -1333,22 +1325,19 @@ void __init enable_IR_x2apic(void) panic("Bios already enabled x2apic, can't enforce nox2apic"); if (!x2apic_preenabled && skip_ioapic_setup) { - printk(KERN_INFO - "Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); + pr_info("Skipped enabling x2apic and Interrupt-remapping " + "because of skipping io-apic setup\n"); return; } ret = dmar_table_init(); if (ret) { - printk(KERN_INFO - "dmar_table_init() failed with %d:\n", ret); + pr_info("dmar_table_init() failed with %d:\n", ret); if (x2apic_preenabled) panic("x2apic enabled by bios. But IR enabling failed"); else - printk(KERN_INFO - "Not enabling x2apic,Intr-remapping\n"); + pr_info("Not enabling x2apic,Intr-remapping\n"); return; } @@ -1357,7 +1346,7 @@ void __init enable_IR_x2apic(void) ret = save_mask_IO_APIC_setup(); if (ret) { - printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); + pr_info("Saving IO-APIC state failed: %d\n", ret); goto end; } @@ -1392,14 +1381,11 @@ end: if (!ret) { if (!x2apic_preenabled) - printk(KERN_INFO - "Enabled x2apic and interrupt-remapping\n"); + pr_info("Enabled x2apic and interrupt-remapping\n"); else - printk(KERN_INFO - "Enabled Interrupt-remapping\n"); + pr_info("Enabled Interrupt-remapping\n"); } else - printk(KERN_ERR - "Failed to enable Interrupt-remapping and x2apic\n"); + pr_err("Failed to enable Interrupt-remapping and x2apic\n"); #else if (!cpu_has_x2apic) return; @@ -1408,8 +1394,8 @@ end: panic("x2apic enabled prior OS handover," " enable CONFIG_INTR_REMAP"); - printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " + " and x2apic\n"); #endif return; @@ -1426,7 +1412,7 @@ end: static int __init detect_init_APIC(void) { if (!cpu_has_apic) { - printk(KERN_INFO "No local APIC present\n"); + pr_info("No local APIC present\n"); return -1; } @@ -1467,8 +1453,8 @@ static int __init detect_init_APIC(void) * "lapic" specified. */ if (!force_enable_local_apic) { - printk(KERN_INFO "Local APIC disabled by BIOS -- " - "you can enable it with \"lapic\"\n"); + pr_info("Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); return -1; } /* @@ -1478,8 +1464,7 @@ static int __init detect_init_APIC(void) */ rdmsr(MSR_IA32_APICBASE, l, h); if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk(KERN_INFO - "Local APIC disabled by BIOS -- reenabling.\n"); + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -1492,7 +1477,7 @@ static int __init detect_init_APIC(void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - printk(KERN_WARNING "Could not enable APIC!\n"); + pr_warning("Could not enable APIC!\n"); return -1; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); @@ -1503,14 +1488,14 @@ static int __init detect_init_APIC(void) if (l & MSR_IA32_APICBASE_ENABLE) mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - printk(KERN_INFO "Found and enabled local APIC!\n"); + pr_info("Found and enabled local APIC!\n"); apic_pm_activate(); return 0; no_apic: - printk(KERN_INFO "No local APIC present or hardware disabled\n"); + pr_info("No local APIC present or hardware disabled\n"); return -1; } #endif @@ -1586,12 +1571,12 @@ int __init APIC_init_uniprocessor(void) { #ifdef CONFIG_X86_64 if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); + pr_info("Apic disabled\n"); return -1; } if (!cpu_has_apic) { disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); + pr_info("Apic disabled by BIOS\n"); return -1; } #else @@ -1603,8 +1588,8 @@ int __init APIC_init_uniprocessor(void) */ if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", - boot_cpu_physical_apicid); + pr_err("BIOS bug, local APIC 0x%x not detected!...\n", + boot_cpu_physical_apicid); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } @@ -1680,9 +1665,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) { u32 v; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1696,8 +1679,8 @@ void smp_spurious_interrupt(struct pt_regs *regs) inc_irq_stat(irq_spurious_count); /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " - "should never happen.\n", smp_processor_id()); + pr_info("spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); irq_exit(); } @@ -1708,9 +1691,7 @@ void smp_error_interrupt(struct pt_regs *regs) { u32 v, v1; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@ -1719,17 +1700,18 @@ void smp_error_interrupt(struct pt_regs *regs) ack_APIC_irq(); atomic_inc(&irq_err_count); - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + /* + * Here is what the APIC error bits mean: + * 0: Send CS error + * 1: Receive CS error + * 2: Send accept error + * 3: Receive accept error + * 4: Reserved + * 5: Send illegal vector + * 6: Received illegal vector + * 7: Illegal register address + */ + pr_debug("APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); irq_exit(); } @@ -1833,15 +1815,15 @@ void __cpuinit generic_processor_info(int apicid, int version) * Validate version */ if (version == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); + pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + version); version = 0x10; } apic_version[apicid] = version; if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + pr_warning("WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); return; } @@ -2204,7 +2186,7 @@ static int __init apic_set_verbosity(char *arg) else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" + pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5145a6e72bb..3a26525a3f3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -391,11 +391,7 @@ static int power_off; #else static int power_off = 1; #endif -#ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; -#else static int realmode_power_off; -#endif #ifdef CONFIG_APM_ALLOW_INTS static int allow_ints = 1; #else diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6649d09ad88..ee4df08feee 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -11,7 +11,7 @@ #include <linux/suspend.h> #include <linux/kbuild.h> #include <asm/ucontext.h> -#include "sigframe.h" +#include <asm/sigframe.h> #include <asm/pgtable.h> #include <asm/fixmap.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 7fcf63d22f8..1d41d3f1edb 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -20,6 +20,8 @@ #include <xen/interface/xen.h> +#include <asm/sigframe.h> + #define __NO_STUBS 1 #undef __SYSCALL #undef _ASM_X86_UNISTD_64_H @@ -87,7 +89,7 @@ int main(void) BLANK(); #undef ENTRY DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe32, uc.uc_mcontext)); + offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); BLANK(); #endif DEFINE(pbe_address, offsetof(struct pbe, address)); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index f0dfe6f17e7..2a0a2a3cac2 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); -long uv_coherency_id; -EXPORT_SYMBOL_GPL(uv_coherency_id); -long uv_region_size; -EXPORT_SYMBOL_GPL(uv_region_size); +long sn_coherency_id; +EXPORT_SYMBOL_GPL(sn_coherency_id); +long sn_region_size; +EXPORT_SYMBOL_GPL(sn_region_size); int uv_type; @@ -100,6 +100,56 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, return ret; } +int +uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, + unsigned long *intr_mmr_offset) +{ + union uv_watchlist_u size_blade; + u64 watchlist; + s64 ret; + + size_blade.size = mq_size; + size_blade.blade = blade; + + /* + * bios returns watchlist number or negative error number. + */ + ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, + size_blade.val, (u64)intr_mmr_offset, + (u64)&watchlist, 0); + if (ret < BIOS_STATUS_SUCCESS) + return ret; + + return watchlist; +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc); + +int +uv_bios_mq_watchlist_free(int blade, int watchlist_num) +{ + return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE, + blade, watchlist_num, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); + +s64 +uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) +{ + return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len, + perms, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); + +s64 +uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) +{ + s64 ret; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, + (u64)addr, buf, (u64)len, 0); + return ret; +} +EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c new file mode 100644 index 00000000000..2ac0ab71412 --- /dev/null +++ b/arch/x86/kernel/check.c @@ -0,0 +1,161 @@ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/kthread.h> +#include <linux/workqueue.h> +#include <asm/e820.h> +#include <asm/proto.h> + +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + + +static __init int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static __init int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static __init int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", + num_scan_areas); + update_e820(); +} + + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for (i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for (; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void check_corruption(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(bios_check_work, check_corruption); + +static void check_corruption(struct work_struct *dummy) +{ + check_for_bios_corruption(); + schedule_delayed_work(&bios_check_work, + round_jiffies_relative(corruption_check_period*HZ)); +} + +static int start_periodic_check_for_corruption(void) +{ + if (!memory_corruption_check || corruption_check_period == 0) + return 0; + + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + + /* First time we run the checks right away */ + schedule_delayed_work(&bios_check_work, 0); + return 0; +} + +module_init(start_periodic_check_for_corruption); + diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 89e53361fe2..c3813306e0b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -2,8 +2,14 @@ # Makefile for x86-compatible CPU details, features and quirks # +# Don't trace early stages of a secondary CPU boot +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_common.o = -pg +endif + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o +obj-y += vmware.o hypervisor.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index ef8f831af82..2cf23634b6d 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -120,9 +120,17 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) & core_select_mask; c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = phys_pkg_id(c->initial_apicid, 0); #else c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; c->phys_proc_id = phys_pkg_id(core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = phys_pkg_id(0); #endif c->x86_max_cores = (core_level_siblings / smp_num_siblings); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8f1e31db2ad..7c878f6aa91 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ad331b4d623..376b9f9d8d2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -37,6 +37,7 @@ #include <asm/proto.h> #include <asm/sections.h> #include <asm/setup.h> +#include <asm/hypervisor.h> #include "cpu.h" @@ -704,6 +705,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) detect_ht(c); #endif + init_hypervisor(c); /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -864,7 +866,7 @@ EXPORT_SYMBOL(_cpu_pda); struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; +static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; void __cpuinit pda_init(int cpu) { @@ -905,8 +907,8 @@ void __cpuinit pda_init(int cpu) } } -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; +static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; extern asmlinkage void ignore_sysret(void); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8e48c5d4467..88ea02dcb62 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> +#include <linux/ftrace.h> #include <linux/acpi.h> #include <acpi/processor.h> @@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; + struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } + trace_power_mark(&it, POWER_PSTATE, next_perf_state); + switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c new file mode 100644 index 00000000000..fb5b86af0b0 --- /dev/null +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -0,0 +1,58 @@ +/* + * Common hypervisor code + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria <akataria@vmware.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <asm/processor.h> +#include <asm/vmware.h> +#include <asm/hypervisor.h> + +static inline void __cpuinit +detect_hypervisor_vendor(struct cpuinfo_x86 *c) +{ + if (vmware_platform()) { + c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; + } else { + c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; + } +} + +unsigned long get_hypervisor_tsc_freq(void) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + return vmware_get_tsc_khz(); + return 0; +} + +static inline void __cpuinit +hypervisor_set_feature_bits(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { + vmware_set_feature_bits(c); + return; + } +} + +void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) +{ + detect_hypervisor_vendor(c); + hypervisor_set_feature_bits(c); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cce0b6118d5..8ea6929e974 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,7 +11,6 @@ #include <asm/pgtable.h> #include <asm/msr.h> #include <asm/uaccess.h> -#include <asm/ptrace.h> #include <asm/ds.h> #include <asm/bugs.h> @@ -41,6 +40,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86 == 15 && c->x86_cache_alignment == 64) c->x86_cache_alignment = 128; #endif + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + } #ifdef CONFIG_X86_32 @@ -242,6 +251,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) intel_workarounds(c); + /* + * Detect the extended topology information if available. This + * will reinitialise the initial_apicid which will be used + * in init_intel_cacheinfo() + */ + detect_extended_topology(c); + l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -307,13 +323,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P4); if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_P3); - - if (cpu_has_bts) - ptrace_bts_init_intel(c); - #endif - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* * let's use the legacy cpuid vector 0x1 and 0x4 for topology diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 3f46afbb1cf..68b5d8681cb 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -644,20 +644,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { - switch(this_leaf->eax.split.type) { - case CACHE_TYPE_DATA: +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +{ + switch (this_leaf->eax.split.type) { + case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); - break; - case CACHE_TYPE_INST: + case CACHE_TYPE_INST: return sprintf(buf, "Instruction\n"); - break; - case CACHE_TYPE_UNIFIED: + case CACHE_TYPE_UNIFIED: return sprintf(buf, "Unified\n"); - break; - default: + default: return sprintf(buf, "Unknown\n"); - break; } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4b031a4ac85..1c838032fd3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - static cpumask_t mce_cpus = CPU_MASK_NONE; - mce_cpu_quirks(c); if (mce_dont_init || - cpu_test_and_set(smp_processor_id(), mce_cpus) || !mce_available(c)) return; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 5eb390a4b2e..748c8f9e7a0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -237,7 +237,7 @@ asmlinkage void mce_threshold_interrupt(void) } } out: - add_pda(irq_threshold_count, 1); + inc_irq_stat(irq_threshold_count); irq_exit(); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index c17eaf5dd6d..4b48f251fd3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void) if (therm_throt_process(msr_val & 1)) mce_log_therm_throt_event(smp_processor_id(), msr_val); - add_pda(irq_thermal_count, 1); + inc_irq_stat(irq_thermal_count); irq_exit(); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index c78c04821ea..1159e269e59 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, } static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; #ifdef CONFIG_MTRR_SANITIZER @@ -1206,39 +1207,43 @@ struct mtrr_cleanup_result { #define PSHIFT (PAGE_SHIFT - 10) static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; -static struct res_range __initdata range_new[RANGE_NUM]; static unsigned long __initdata min_loss_pfn[RANGE_NUM]; -static int __init mtrr_cleanup(unsigned address_bits) +static void __init print_out_mtrr_range_state(void) { - unsigned long extra_remove_base, extra_remove_size; - unsigned long base, size, def, dummy; - mtrr_type type; - int nr_range, nr_range_new; - u64 chunk_size, gran_size; - unsigned long range_sums, range_sums_new; - int index_good; - int num_reg_good; int i; + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + mtrr_type type; - /* extra one for all 0 */ - int num[MTRR_NUM_TYPES + 1]; + for (i = 0; i < num_var_ranges; i++) { - if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) - return 0; - rdmsr(MTRRdefType_MSR, def, dummy); - def &= 0xff; - if (def != MTRR_TYPE_UNCACHABLE) - return 0; + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; - /* get it and store it aside */ - memset(range_state, 0, sizeof(range_state)); - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - range_state[i].base_pfn = base; - range_state[i].size_pfn = size; - range_state[i].type = type; + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); } +} + +static int __init mtrr_need_cleanup(void) +{ + int i; + mtrr_type type; + unsigned long size; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; /* check entries number */ memset(num, 0, sizeof(num)); @@ -1263,29 +1268,133 @@ static int __init mtrr_cleanup(unsigned address_bits) num_var_ranges - num[MTRR_NUM_TYPES]) return 0; - /* print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); - for (i = 0; i < num_var_ranges; i++) { - char start_factor = 'K', size_factor = 'K'; - unsigned long start_base, size_base; + return 1; +} - size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); - if (!size_base) - continue; +static unsigned long __initdata range_sums; +static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long extra_remove_base, + unsigned long extra_remove_size, + int i) +{ + int num_reg; + static struct res_range range_new[RANGE_NUM]; + static int nr_range_new; + unsigned long range_sums_new; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, + chunk_size, gran_size); + + /* we got new setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; - size_base = to_size_factor(size_base, &size_factor), - start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), - type = range_state[i].type; + /* double check it */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || + memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", - i, start_base, start_factor, - size_base, size_factor, - (type == MTRR_TYPE_UNCACHABLE) ? "UC" : - ((type == MTRR_TYPE_WRPROT) ? "WP" : - ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) - ); + if (!result[i].bad && (range_sums - range_sums_new < + min_loss_pfn[num_reg])) { + min_loss_pfn[num_reg] = + range_sums - range_sums_new; } +} + +static void __init mtrr_print_out_one_result(int i) +{ + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); +} + +static int __init mtrr_search_optimal_index(void) +{ + int i; + int num_reg_good; + int index_good; + + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { + if (!min_loss_pfn[i]) + num_reg_good = i; + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + return index_good; +} + + +static int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long extra_remove_base, extra_remove_size; + unsigned long base, size, def, dummy; + mtrr_type type; + u64 chunk_size, gran_size; + int index_good; + int i; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* check if we need handle it and can handle it */ + if (!mtrr_need_cleanup()) + return 0; + + /* print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); extra_remove_size = 0; @@ -1309,176 +1418,64 @@ static int __init mtrr_cleanup(unsigned address_bits) range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { - int num_reg; - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - debug_print++; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, - mtrr_gran_size); + i = 0; + mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, + extra_remove_base, extra_remove_size, i); - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, - extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); + mtrr_print_out_one_result(i); - i = 0; - result[i].chunk_sizek = mtrr_chunk_size >> 10; - result[i].gran_sizek = mtrr_gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad?"*BAD*":" ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad?"-":"", - lose_base, lose_factor); if (!result[i].bad) { set_var_mtrr_all(address_bits); return 1; } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " "will find optimal one\n"); - debug_print--; - memset(result, 0, sizeof(result[0])); } i = 0; memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); memset(result, 0, sizeof(result)); for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { - char gran_factor; - unsigned long gran_base; - - if (debug_print) - gran_base = to_size_factor(gran_size >> 10, &gran_factor); for (chunk_size = gran_size; chunk_size < (1ULL<<32); chunk_size <<= 1) { - int num_reg; - if (debug_print) { - char chunk_factor; - unsigned long chunk_base; - - chunk_base = to_size_factor(chunk_size>>10, &chunk_factor), - printk(KERN_INFO "\n"); - printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n", - gran_base, gran_factor, chunk_base, chunk_factor); - } if (i >= NUM_RESULT) continue; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); - - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); - - result[i].chunk_sizek = chunk_size >> 10; - result[i].gran_sizek = gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - /* double check it */ - if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; + mtrr_calc_range_state(chunk_size, gran_size, + extra_remove_base, extra_remove_size, i); + if (debug_print) { + mtrr_print_out_one_result(i); + printk(KERN_INFO "\n"); } - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } i++; } } - /* print out all */ - for (i = 0; i < NUM_RESULT; i++) { - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad?"*BAD*":" ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad?"-":"", - lose_base, lose_factor); - } - /* try to find the optimal index */ - if (nr_mtrr_spare_reg >= num_var_ranges) - nr_mtrr_spare_reg = num_var_ranges - 1; - num_reg_good = -1; - for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { - if (!min_loss_pfn[i]) - num_reg_good = i; - } - - index_good = -1; - if (num_reg_good != -1) { - for (i = 0; i < NUM_RESULT; i++) { - if (!result[i].bad && - result[i].num_reg == num_reg_good && - !result[i].lose_cover_sizek) { - index_good = i; - break; - } - } - } + index_good = mtrr_search_optimal_index(); if (index_good != -1) { - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); i = index_good; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n", - result[i].num_reg, lose_base, lose_factor); + mtrr_print_out_one_result(i); + /* convert ranges to var ranges state */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; gran_size <<= 10; - debug_print++; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - debug_print--; set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); return 1; + } else { + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) + mtrr_print_out_one_result(i); } printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); @@ -1562,7 +1559,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; - int nr_range; u64 total_trim_size; /* extra one for all 0 */ diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c new file mode 100644 index 00000000000..284c399e323 --- /dev/null +++ b/arch/x86/kernel/cpu/vmware.c @@ -0,0 +1,112 @@ +/* + * VMware Detection code. + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria <akataria@vmware.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/dmi.h> +#include <asm/div64.h> +#include <asm/vmware.h> + +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 +#define VMWARE_HYPERVISOR_PORT 0x5658 + +#define VMWARE_PORT_CMD_GETVERSION 10 +#define VMWARE_PORT_CMD_GETHZ 45 + +#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ + __asm__("inl (%%dx)" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "0"(VMWARE_HYPERVISOR_MAGIC), \ + "1"(VMWARE_PORT_CMD_##cmd), \ + "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ + "memory"); + +static inline int __vmware_platform(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx); + return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; +} + +static unsigned long __vmware_get_tsc_khz(void) +{ + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (ebx == UINT_MAX) + return 0; + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + return tsc_hz; +} + +/* + * While checking the dmi string infomation, just checking the product + * serial key should be enough, as this will always have a VMware + * specific string when running under VMware hypervisor. + */ +int vmware_platform(void) +{ + if (cpu_has_hypervisor) { + unsigned int eax, ebx, ecx, edx; + char hyper_vendor_id[13]; + + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); + memcpy(hyper_vendor_id + 0, &ebx, 4); + memcpy(hyper_vendor_id + 4, &ecx, 4); + memcpy(hyper_vendor_id + 8, &edx, 4); + hyper_vendor_id[12] = '\0'; + if (!strcmp(hyper_vendor_id, "VMwareVMware")) + return 1; + } else if (dmi_available && dmi_name_in_serial("VMware") && + __vmware_platform()) + return 1; + + return 0; +} + +unsigned long vmware_get_tsc_khz(void) +{ + BUG_ON(!vmware_platform()); + return __vmware_get_tsc_khz(); +} + +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); +} diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 26855381790..d84a852e4cd 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -29,34 +29,17 @@ #include <mach_ipi.h> -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) -static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct notifier_block *self, - unsigned long val, void *data) +static void kdump_nmi_callback(int cpu, struct die_args *args) { struct pt_regs *regs; #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; #endif - int cpu; - if (val != DIE_NMI_IPI) - return NOTIFY_OK; - - regs = ((struct die_args *)data)->regs; - cpu = raw_smp_processor_id(); - - /* Don't do anything if this handler is invoked on crashing cpu. - * Otherwise, system will completely hang. Crashing cpu can get - * an NMI if system was initially booted with nmi_watchdog parameter. - */ - if (cpu == crashing_cpu) - return NOTIFY_STOP; - local_irq_disable(); + regs = args->regs; #ifdef CONFIG_X86_32 if (!user_mode_vm(regs)) { @@ -65,54 +48,19 @@ static int crash_nmi_callback(struct notifier_block *self, } #endif crash_save_cpu(regs, cpu); - disable_local_APIC(); - atomic_dec(&waiting_for_crash_ipi); - /* Assume hlt works */ - halt(); - for (;;) - cpu_relax(); - - return 1; -} -static void smp_send_nmi_allbutself(void) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); + disable_local_APIC(); } -static struct notifier_block crash_nmi_nb = { - .notifier_call = crash_nmi_callback, -}; - -static void nmi_shootdown_cpus(void) +static void kdump_nmi_shootdown_cpus(void) { - unsigned long msecs; - - atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - /* Would it be better to replace the trap vector here? */ - if (register_die_notifier(&crash_nmi_nb)) - return; /* return what? */ - /* Ensure the new callback function is set before sending - * out the NMI - */ - wmb(); + nmi_shootdown_cpus(kdump_nmi_callback); - smp_send_nmi_allbutself(); - - msecs = 1000; /* Wait at most a second for the other cpus to stop */ - while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { - mdelay(1); - msecs--; - } - - /* Leave the nmi callback set */ disable_local_APIC(); } + #else -static void nmi_shootdown_cpus(void) +static void kdump_nmi_shootdown_cpus(void) { /* There are no cpus to shootdown */ } @@ -131,9 +79,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) /* The kernel is broken so disable interrupts */ local_irq_disable(); - /* Make a note of crashing cpu. Will be used in NMI callback.*/ - crashing_cpu = safe_smp_processor_id(); - nmi_shootdown_cpus(); + kdump_nmi_shootdown_cpus(); lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index a2d1176c38e..da91701a234 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -6,14 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - DS and BTS hardware configuration + * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -28,22 +27,69 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/kernel.h> /* * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the size of the DS structure in bytes */ - unsigned char sizeof_ds; - /* the size of one pointer-typed field in the DS structure in bytes; - this covers the first 8 fields related to buffer management. */ + /* the name of the configuration */ + const char *name; + /* the size of one pointer-typed field in the DS structure and + in the BTS and PEBS buffers in bytes; + this covers the first 8 DS fields related to buffer management. */ unsigned char sizeof_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; + /* a series of bit-masks to control various features indexed + * by enum ds_feature */ + unsigned long ctl[dsf_ctl_max]; }; -static struct ds_configuration ds_cfg; +static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) + +#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ +#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + +#define BTS_CONTROL \ + (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ + ds_cfg.ctl[dsf_bts_overflow]) + + +/* + * A BTS or PEBS tracer. + * + * This holds the configuration of the tracer and serves as a handle + * to identify tracers. + */ +struct ds_tracer { + /* the DS context (partially) owned by this tracer */ + struct ds_context *context; + /* the buffer provided on ds_request() and its size in bytes */ + void *buffer; + size_t size; +}; + +struct bts_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* the trace including the DS configuration */ + struct bts_trace trace; + /* buffer overflow notification function */ + bts_ovfl_callback_t ovfl; +}; + +struct pebs_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* the trace including the DS configuration */ + struct pebs_trace trace; + /* buffer overflow notification function */ + pebs_ovfl_callback_t ovfl; +}; /* * Debug Store (DS) save area configuration (see Intel64 and IA32 @@ -109,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, /* - * Locking is done only for allocating BTS or PEBS resources and for - * guarding context and buffer memory allocation. - * - * Most functions require the current task to own the ds context part - * they are going to access. All the locking is done when validating - * access to the context. + * Locking is done only for allocating BTS or PEBS resources. */ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); - -/* - * Validate that the current task is allowed to access the BTS/PEBS - * buffer of the parameter task. - * - * Returns 0, if access is granted; -Eerrno, otherwise. - */ -static inline int ds_validate_access(struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return -EPERM; - - if (context->owner[qual] == current) - return 0; - - return -EPERM; -} +static DEFINE_SPINLOCK(ds_lock); /* @@ -150,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context, * >0 number of per-thread tracers * <0 number of per-cpu tracers * - * The below functions to get and put tracers and to check the - * allocation type require the ds_lock to be held by the caller. - * * Tracers essentially gives the number of ds contexts for a certain * type of allocation. */ -static long tracers; +static atomic_t tracers = ATOMIC_INIT(0); static inline void get_tracer(struct task_struct *task) { - tracers += (task ? 1 : -1); + if (task) + atomic_inc(&tracers); + else + atomic_dec(&tracers); } static inline void put_tracer(struct task_struct *task) { - tracers -= (task ? 1 : -1); + if (task) + atomic_dec(&tracers); + else + atomic_inc(&tracers); } static inline int check_tracer(struct task_struct *task) { - return (task ? (tracers >= 0) : (tracers <= 0)); + return task ? + (atomic_read(&tracers) >= 0) : + (atomic_read(&tracers) <= 0); } @@ -183,99 +211,70 @@ static inline int check_tracer(struct task_struct *task) * * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. - * - * We distinguish between an allocating and a non-allocating get of a - * context: - * - the allocating get is used for requesting BTS/PEBS resources. It - * requires the caller to hold the global ds_lock. - * - the non-allocating get is used for all other cases. A - * non-existing context indicates an error. It acquires and releases - * the ds_lock itself for obtaining the context. - * - * A context and its DS configuration are allocated and deallocated - * together. A context always has a DS configuration of the - * appropriate size. - */ -static DEFINE_PER_CPU(struct ds_context *, system_context); - -#define this_system_context per_cpu(system_context, smp_processor_id()) - -/* - * Returns the pointer to the parameter task's context or to the - * system-wide context, if task is NULL. - * - * Increases the use count of the returned context, if not NULL. */ -static inline struct ds_context *ds_get_context(struct task_struct *task) -{ - struct ds_context *context; - unsigned long irq; +struct ds_context { + /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + unsigned char ds[MAX_SIZEOF_DS]; + /* the owner of the BTS and PEBS configuration, respectively */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + /* use count */ + unsigned long count; + /* a pointer to the context location inside the thread_struct + * or the per_cpu context array */ + struct ds_context **this; + /* a pointer to the task owning this context, or NULL, if the + * context is owned by a cpu */ + struct task_struct *task; +}; - spin_lock_irqsave(&ds_lock, irq); +static DEFINE_PER_CPU(struct ds_context *, system_context_array); - context = (task ? task->thread.ds_ctx : this_system_context); - if (context) - context->count++; +#define system_context per_cpu(system_context_array, smp_processor_id()) - spin_unlock_irqrestore(&ds_lock, irq); - - return context; -} -/* - * Same as ds_get_context, but allocates the context and it's DS - * structure, if necessary; returns NULL; if out of memory. - */ -static inline struct ds_context *ds_alloc_context(struct task_struct *task) +static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &this_system_context); - struct ds_context *context = *p_context; + (task ? &task->thread.ds_ctx : &system_context); + struct ds_context *context = NULL; + struct ds_context *new_context = NULL; unsigned long irq; - if (!context) { - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return NULL; - - context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); - if (!context->ds) { - kfree(context); - return NULL; - } + /* Chances are small that we already have a context. */ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + if (!new_context) + return NULL; - spin_lock_irqsave(&ds_lock, irq); + spin_lock_irqsave(&ds_lock, irq); - if (*p_context) { - kfree(context->ds); - kfree(context); + context = *p_context; + if (!context) { + context = new_context; - context = *p_context; - } else { - *p_context = context; + context->this = p_context; + context->task = task; + context->count = 0; - context->this = p_context; - context->task = task; + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + if (!task || (task == current)) + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, - (unsigned long)context->ds); - } - spin_unlock_irqrestore(&ds_lock, irq); + *p_context = context; } context->count++; + spin_unlock_irqrestore(&ds_lock, irq); + + if (context != new_context) + kfree(new_context); + return context; } -/* - * Decreases the use count of the parameter context, if not NULL. - * Deallocates the context, if the use count reaches zero. - */ static inline void ds_put_context(struct ds_context *context) { unsigned long irq; @@ -285,8 +284,10 @@ static inline void ds_put_context(struct ds_context *context) spin_lock_irqsave(&ds_lock, irq); - if (--context->count) - goto out; + if (--context->count) { + spin_unlock_irqrestore(&ds_lock, irq); + return; + } *(context->this) = NULL; @@ -296,135 +297,263 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - put_tracer(context->task); + spin_unlock_irqrestore(&ds_lock, irq); - /* free any leftover buffers from tracers that did not - * deallocate them properly. */ - kfree(context->buffer[ds_bts]); - kfree(context->buffer[ds_pebs]); - kfree(context->ds); kfree(context); - out: - spin_unlock_irqrestore(&ds_lock, irq); } /* - * Handle a buffer overflow + * Call the tracer's callback on a buffer overflow. * - * task: the task whose buffers are overflowing; - * NULL for a buffer overflow on the current cpu * context: the ds context * qual: the buffer type */ -static void ds_overflow(struct task_struct *task, struct ds_context *context, - enum ds_qualifier qual) +static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) { - if (!context) - return; - - if (context->callback[qual]) - (*context->callback[qual])(task); - - /* todo: do some more overflow handling */ + switch (qual) { + case ds_bts: + if (context->bts_master && + context->bts_master->ovfl) + context->bts_master->ovfl(context->bts_master); + break; + case ds_pebs: + if (context->pebs_master && + context->pebs_master->ovfl) + context->pebs_master->ovfl(context->pebs_master); + break; + } } /* - * Allocate a non-pageable buffer of the parameter size. - * Checks the memory and the locked memory rlimit. + * Write raw data into the BTS or PEBS buffer. * - * Returns the buffer, if successful; - * NULL, if out of memory or rlimit exceeded. + * The remainder of any partially written record is zeroed out. * - * size: the requested buffer size in bytes - * pages (out): if not NULL, contains the number of pages reserved + * context: the DS context + * qual: the buffer type + * record: the data to write + * size: the size of the data */ -static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) { - unsigned long rlim, vm, pgsz; - void *buffer; + int bytes_written = 0; - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + if (!record) + return -EINVAL; - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - return NULL; + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) - return NULL; + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - return NULL; + write_end = min(end, int_th); - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; - if (pages) - *pages = pgsz; + if (write_end <= index) + break; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); - return buffer; + record = (const char *)record + write_size; + size -= write_size; + bytes_written += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(context, qual); + } + + return bytes_written; } -static int ds_request(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl, enum ds_qualifier qual) + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + * + * We use two levels of abstraction: + * - the raw data level defined here + * - an arch-independent level defined in ds.h + */ + +enum bts_field { + bts_from, + bts_to, + bts_flags, + + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, + + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) { - struct ds_context *context; - unsigned long buffer, adj; - const unsigned long alignment = (1 << 3); - unsigned long irq; - int error = 0; + base += (ds_cfg.sizeof_field * field); + return *(unsigned long *)base; +} + +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (ds_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; +} - if (!ds_cfg.sizeof_ds) - return -EOPNOTSUPP; - /* we require some space to do alignment adjustments below */ - if (size < (alignment + ds_cfg.sizeof_rec[qual])) +/* + * The raw BTS data is architecture dependent. + * + * For higher-level users, we give an arch-independent view. + * - ds.h defines struct bts_struct + * - bts_read translates one raw bts record into a bts_struct + * - bts_write translates one bts_struct into the raw format and + * writes it into the top of the parameter tracer's buffer. + * + * return: bytes read/written on success; -Eerrno, otherwise + */ +static int bts_read(struct bts_tracer *tracer, const void *at, + struct bts_struct *out) +{ + if (!tracer) return -EINVAL; - /* buffer overflow notification is not yet implemented */ - if (ovfl) - return -EOPNOTSUPP; + if (at < tracer->trace.ds.begin) + return -EINVAL; + if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) + return -EINVAL; - context = ds_alloc_context(task); - if (!context) - return -ENOMEM; + memset(out, 0, sizeof(*out)); + if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { + out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); + out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); + out->variant.timestamp.pid = bts_get(at, bts_pid); + } else { + out->qualifier = bts_branch; + out->variant.lbr.from = bts_get(at, bts_from); + out->variant.lbr.to = bts_get(at, bts_to); + + if (!out->variant.lbr.from && !out->variant.lbr.to) + out->qualifier = bts_invalid; + } - spin_lock_irqsave(&ds_lock, irq); + return ds_cfg.sizeof_rec[ds_bts]; +} - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; +static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) +{ + unsigned char raw[MAX_SIZEOF_BTS]; - get_tracer(task); + if (!tracer) + return -EINVAL; - error = -EALREADY; - if (context->owner[qual] == current) - goto out_put_tracer; - error = -EPERM; - if (context->owner[qual] != NULL) - goto out_put_tracer; - context->owner[qual] = current; + if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) + return -EOVERFLOW; - spin_unlock_irqrestore(&ds_lock, irq); + switch (in->qualifier) { + case bts_invalid: + bts_set(raw, bts_from, 0); + bts_set(raw, bts_to, 0); + bts_set(raw, bts_flags, 0); + break; + case bts_branch: + bts_set(raw, bts_from, in->variant.lbr.from); + bts_set(raw, bts_to, in->variant.lbr.to); + bts_set(raw, bts_flags, 0); + break; + case bts_task_arrives: + case bts_task_departs: + bts_set(raw, bts_qual, (bts_escape | in->qualifier)); + bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); + bts_set(raw, bts_pid, in->variant.timestamp.pid); + break; + default: + return -EINVAL; + } + return ds_write(tracer->ds.context, ds_bts, raw, + ds_cfg.sizeof_rec[ds_bts]); +} - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &context->pages[qual]); - if (!base) - goto out_release; - context->buffer[qual] = base; - } - error = 0; +static void ds_write_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; + + ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); + ds_set(ds, qual, ds_index, (unsigned long)cfg->top); + ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); + ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); +} + +static void ds_read_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; - context->callback[qual] = ovfl; + cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); + cfg->top = (void *)ds_get(ds, qual, ds_index); + cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum); + cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); +} + +static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, + void *base, size_t size, size_t ith, + unsigned int flags) { + unsigned long buffer, adj; /* adjust the buffer address and size to meet alignment * constraints: @@ -436,410 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size, */ buffer = (unsigned long)base; - adj = ALIGN(buffer, alignment) - buffer; + adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; buffer += adj; size -= adj; - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; - - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + trace->n = size / ds_cfg.sizeof_rec[qual]; + trace->size = ds_cfg.sizeof_rec[qual]; - if (ovfl) { - /* todo: select a suitable interrupt threshold */ - } else - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size + 1); + size = (trace->n * trace->size); - /* we keep the context until ds_release */ - return error; - - out_release: - context->owner[qual] = NULL; - ds_put_context(context); - put_tracer(task); - return error; - - out_put_tracer: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - put_tracer(task); - return error; + trace->begin = (void *)buffer; + trace->top = trace->begin; + trace->end = (void *)(buffer + size); + /* The value for 'no threshold' is -1, which will set the + * threshold outside of the buffer, just like we want it. + */ + trace->ith = (void *)(buffer + size - ith); - out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - return error; + trace->flags = flags; } -int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_bts); -} -int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_pebs); -} - -static int ds_release(struct task_struct *task, enum ds_qualifier qual) +static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, + enum ds_qualifier qual, struct task_struct *task, + void *base, size_t size, size_t th, unsigned int flags) { struct ds_context *context; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + error = -EINVAL; + if (!base) goto out; - kfree(context->buffer[qual]); - context->buffer[qual] = NULL; - - current->mm->total_vm -= context->pages[qual]; - current->mm->locked_vm -= context->pages[qual]; - context->pages[qual] = 0; - context->owner[qual] = NULL; - - /* - * we put the context twice: - * once for the ds_get_context - * once for the corresponding ds_request - */ - ds_put_context(context); - out: - ds_put_context(context); - return error; -} + /* we require some space to do alignment adjustments below */ + error = -EINVAL; + if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + goto out; -int ds_release_bts(struct task_struct *task) -{ - return ds_release(task, ds_bts); -} + if (th != (size_t)-1) { + th *= ds_cfg.sizeof_rec[qual]; -int ds_release_pebs(struct task_struct *task) -{ - return ds_release(task, ds_pebs); -} + error = -EINVAL; + if (size <= th) + goto out; + } -static int ds_get_index(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) -{ - struct ds_context *context; - unsigned long base, index; - int error; + tracer->buffer = base; + tracer->size = size; + error = -ENOMEM; context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + if (!context) goto out; + tracer->context = context; - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); + ds_init_ds_trace(trace, qual, base, size, th, flags); - error = ((index - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; + error = 0; out: - ds_put_context(context); return error; } -int ds_get_bts_index(struct task_struct *task, size_t *pos) -{ - return ds_get_index(task, pos, ds_bts); -} - -int ds_get_pebs_index(struct task_struct *task, size_t *pos) +struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { - return ds_get_index(task, pos, ds_pebs); -} - -static int ds_get_end(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) -{ - struct ds_context *context; - unsigned long base, end; + struct bts_tracer *tracer; + unsigned long irq; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + error = -EOPNOTSUPP; + if (!ds_cfg.ctl[dsf_bts]) goto out; - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; - error = ((end - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; -} + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; -int ds_get_bts_end(struct task_struct *task, size_t *pos) -{ - return ds_get_end(task, pos, ds_bts); -} + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_bts, task, base, size, th, flags); + if (error < 0) + goto out_tracer; -int ds_get_pebs_end(struct task_struct *task, size_t *pos) -{ - return ds_get_end(task, pos, ds_pebs); -} -static int ds_access(struct task_struct *task, size_t index, - const void **record, enum ds_qualifier qual) -{ - struct ds_context *context; - unsigned long base, idx; - int error; + spin_lock_irqsave(&ds_lock, irq); - if (!record) - return -EINVAL; + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + error = -EPERM; + if (tracer->ds.context->bts_master) + goto out_put_tracer; + tracer->ds.context->bts_master = tracer; - base = ds_get(context->ds, qual, ds_buffer_base); - idx = base + (index * ds_cfg.sizeof_rec[qual]); + spin_unlock_irqrestore(&ds_lock, irq); - error = -EINVAL; - if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - goto out; - *record = (const void *)idx; - error = ds_cfg.sizeof_rec[qual]; - out: - ds_put_context(context); - return error; -} + tracer->trace.read = bts_read; + tracer->trace.write = bts_write; -int ds_access_bts(struct task_struct *task, size_t index, const void **record) -{ - return ds_access(task, index, record, ds_bts); -} + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_bts(tracer); -int ds_access_pebs(struct task_struct *task, size_t index, const void **record) -{ - return ds_access(task, index, record, ds_pebs); + return tracer; + + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); + out_tracer: + kfree(tracer); + out: + return ERR_PTR(error); } -static int ds_write(struct task_struct *task, const void *record, size_t size, - enum ds_qualifier qual, int force) +struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { - struct ds_context *context; + struct pebs_tracer *tracer; + unsigned long irq; int error; - if (!record) - return -EINVAL; + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; - error = -EPERM; - context = ds_get_context(task); - if (!context) + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) goto out; + tracer->ovfl = ovfl; - if (!force) { - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - } + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_pebs, task, base, size, th, flags); + if (error < 0) + goto out_tracer; - error = 0; - while (size) { - unsigned long base, index, end, write_end, int_th; - unsigned long write_size, adj_write_size; + spin_lock_irqsave(&ds_lock, irq); - /* - * write as much as possible without producing an - * overflow interrupt. - * - * interrupt_threshold must either be - * - bigger than absolute_maximum or - * - point to a record between buffer_base and absolute_maximum - * - * index points to a valid record. - */ - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - end = ds_get(context->ds, qual, ds_absolute_maximum); - int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); - write_end = min(end, int_th); + error = -EPERM; + if (tracer->ds.context->pebs_master) + goto out_put_tracer; + tracer->ds.context->pebs_master = tracer; - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ - if (write_end <= index) - write_end = end; + spin_unlock_irqrestore(&ds_lock, irq); - if (write_end <= index) - goto out; + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_pebs(tracer); - write_size = min((unsigned long) size, write_end - index); - memcpy((void *)index, record, write_size); + return tracer; - record = (const char *)record + write_size; - size -= write_size; - error += write_size; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); + out_tracer: + kfree(tracer); + out: + return ERR_PTR(error); +} - adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; - adj_write_size *= ds_cfg.sizeof_rec[qual]; +void ds_release_bts(struct bts_tracer *tracer) +{ + if (!tracer) + return; - /* zero out trailing bytes */ - memset((char *)index + write_size, 0, - adj_write_size - write_size); - index += adj_write_size; + ds_suspend_bts(tracer); - if (index >= end) - index = base; - ds_set(context->ds, qual, ds_index, index); + WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); + tracer->ds.context->bts_master = NULL; - if (index >= int_th) - ds_overflow(task, context, qual); - } + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); - out: - ds_put_context(context); - return error; + kfree(tracer); } -int ds_write_bts(struct task_struct *task, const void *record, size_t size) +void ds_suspend_bts(struct bts_tracer *tracer) { - return ds_write(task, record, size, ds_bts, /* force = */ 0); -} + struct task_struct *task; -int ds_write_pebs(struct task_struct *task, const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 0); -} + if (!tracer) + return; -int ds_unchecked_write_bts(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_bts, /* force = */ 1); -} + task = tracer->ds.context->task; -int ds_unchecked_write_pebs(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 1); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); + + if (task) { + task->thread.debugctlmsr &= ~BTS_CONTROL; + + if (!task->thread.debugctlmsr) + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } } -static int ds_reset_or_clear(struct task_struct *task, - enum ds_qualifier qual, int clear) +void ds_resume_bts(struct bts_tracer *tracer) { - struct ds_context *context; - unsigned long base, end; - int error; + struct task_struct *task; + unsigned long control; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + if (!tracer) + return; - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + task = tracer->ds.context->task; - if (clear) - memset((void *)base, 0, end - base); + control = ds_cfg.ctl[dsf_bts]; + if (!(tracer->trace.ds.flags & BTS_KERNEL)) + control |= ds_cfg.ctl[dsf_bts_kernel]; + if (!(tracer->trace.ds.flags & BTS_USER)) + control |= ds_cfg.ctl[dsf_bts_user]; - ds_set(context->ds, qual, ds_index, base); + if (task) { + task->thread.debugctlmsr |= control; + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } - error = 0; - out: - ds_put_context(context); - return error; + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() | control); } -int ds_reset_bts(struct task_struct *task) +void ds_release_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); + if (!tracer) + return; + + ds_suspend_pebs(tracer); + + WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); + tracer->ds.context->pebs_master = NULL; + + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); + + kfree(tracer); } -int ds_reset_pebs(struct task_struct *task) +void ds_suspend_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); + } -int ds_clear_bts(struct task_struct *task) +void ds_resume_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); + } -int ds_clear_pebs(struct task_struct *task) +const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); + if (!tracer) + return NULL; + + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + return &tracer->trace; } -int ds_get_pebs_reset(struct task_struct *task, u64 *value) +const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) { - struct ds_context *context; - int error; + if (!tracer) + return NULL; + + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + tracer->trace.reset_value = + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - if (!value) + return &tracer->trace; +} + +int ds_reset_bts(struct bts_tracer *tracer) +{ + if (!tracer) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + tracer->trace.ds.top = tracer->trace.ds.begin; - *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); - error = 0; - out: - ds_put_context(context); - return error; + return 0; } -int ds_set_pebs_reset(struct task_struct *task, u64 value) +int ds_reset_pebs(struct pebs_tracer *tracer) { - struct ds_context *context; - int error; + if (!tracer) + return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + tracer->trace.ds.top = tracer->trace.ds.begin; - *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); - error = 0; - out: - ds_put_context(context); - return error; + return 0; +} + +int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +{ + if (!tracer) + return -EINVAL; + + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; + + return 0; } -static const struct ds_configuration ds_cfg_var = { - .sizeof_ds = sizeof(long) * 12, - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, +static const struct ds_configuration ds_cfg_netburst = { + .name = "netburst", + .ctl[dsf_bts] = (1 << 2) | (1 << 3), + .ctl[dsf_bts_kernel] = (1 << 5), + .ctl[dsf_bts_user] = (1 << 6), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = sizeof(long) * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; -static const struct ds_configuration ds_cfg_64 = { - .sizeof_ds = 8 * 12, - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, +static const struct ds_configuration ds_cfg_pentium_m = { + .name = "pentium m", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = 8 * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = 8 * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; +static const struct ds_configuration ds_cfg_core2 = { + .name = "core 2", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 18, +}; -static inline void +static void ds_configure(const struct ds_configuration *cfg) { + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; + + printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + + if (!cpu_has_bts) { + ds_cfg.ctl[dsf_bts] = 0; + printk(KERN_INFO "[ds] bts not available\n"); + } + if (!cpu_has_pebs) + printk(KERN_INFO "[ds] pebs not available\n"); + + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -847,16 +949,15 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { + case 0 ... 0xC: + /* sorry, don't know about them */ + break; case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_pentium_m); break; - case 0xF: /* Core2 */ - case 0x1C: /* Atom */ - ds_configure(&ds_cfg_64); - break; - default: - /* sorry, don't know about them */ + default: /* Core2, Atom, ... */ + ds_configure(&ds_cfg_core2); break; } break; @@ -865,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_netburst); break; default: /* sorry, don't know about them */ @@ -878,12 +979,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } -void ds_free(struct ds_context *context) +/* + * Change the DS configuration from tracing prev to tracing next. + */ +void ds_switch_to(struct task_struct *prev, struct task_struct *next) +{ + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + + if (prev_ctx) { + update_debugctlmsr(0); + + if (prev_ctx->bts_master && + (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_departs, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = prev->pid + }; + bts_write(prev_ctx->bts_master, &ts); + } + } + + if (next_ctx) { + if (next_ctx->bts_master && + (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_arrives, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = next->pid + }; + bts_write(next_ctx->bts_master, &ts); + } + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); + } + + update_debugctlmsr(next->thread.debugctlmsr); +} + +void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +{ + clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); + tsk->thread.ds_ctx = NULL; +} + +void ds_exit_thread(struct task_struct *tsk) { - /* This is called when the task owning the parameter context - * is dying. There should not be any user of that context left - * to disturb us, anymore. */ - unsigned long leftovers = context->count; - while (leftovers--) - ds_put_context(context); + WARN_ON(tsk->thread.ds_ctx); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c new file mode 100644 index 00000000000..6b1f6f6f866 --- /dev/null +++ b/arch/x86/kernel/dumpstack.c @@ -0,0 +1,351 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/sysfs.h> + +#include <asm/stacktrace.h> + +#include "dumpstack.h" + +int panic_on_unrecovered_nmi; +unsigned int code_bytes = 64; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ + struct task_struct *task = tinfo->task; + unsigned long ret_addr; + int index = task->curr_ret_stack; + + if (addr != (unsigned long)return_to_handler) + return; + + if (!task->ret_stack || index < *graph) + return; + + index -= *graph; + ret_addr = task->ret_stack[index].ret; + + ops->address(data, ret_addr, 1); + + (*graph)++; +} +#else +static inline void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ } +#endif + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } + stack++; + } + return bp; +} + + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + oops_exit(); + + if (!signr) + return; + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ +#ifdef CONFIG_X86_32 + unsigned short ss; + unsigned long sp; +#endif + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + sysfs_printk_last_file(); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); +#ifdef CONFIG_X86_32 + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); +#else + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); +#endif + return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + if (!user_mode_vm(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + flags = oops_begin(); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + oops_end(flags, regs, 0); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h new file mode 100644 index 00000000000..da87590b869 --- /dev/null +++ b/arch/x86/kernel/dumpstack.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + +#ifndef DUMPSTACK_H +#define DUMPSTACK_H + +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph); + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; +extern int kstack_depth_to_print; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; +#endif diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b3614752197..d593cd1f58d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,69 +17,14 @@ #include <asm/stacktrace.h> -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} +#include "dumpstack.h" void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + int graph = 0; + if (!task) task = current; @@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data, NULL); + bp = print_context_stack(context, stack, bp, ops, + data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) @@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { @@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - unsigned long flags; - - oops_enter(); - - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } - die_nest_count++; - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (die_nest_count < 3) { - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - - oops_end(flags, regs, SIGSEGV); -} - -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - bust_spinlocks(0); - do_exit(SIGSEGV); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 96a5db7da8a..c302d070704 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -17,19 +17,7 @@ #include <asm/stacktrace.h> -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} +#include "dumpstack.h" static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) @@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) @@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; + int graph = 0; if (!task) task = current; @@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, break; bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); + data, estack_end, &graph); ops->stack(data, "<EOE>"); /* * We link to the next stack via the @@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); + ops, data, irqstack_end, &graph); /* * We link to the next stack (which would be * the process stack normally) the last @@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); - void show_registers(struct pt_regs *regs) { int i; @@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - die_owner = -1; - bust_spinlocks(0); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - if (!regs) { - oops_exit(); - return; - } - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - add_taint(TAINT_DIE); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); - return 0; -} - -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (!user_mode(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); -} - -notrace __kprobes void -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - flags = oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7aafeb5263e..65a13943e09 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -677,22 +677,6 @@ struct early_res { }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, -#endif -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, - /* - * Has to be in very low memory so we can execute - * real-mode AP code. - */ - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, -#endif {} }; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 1b894b72c0f..744aa7fc49d 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@ #include <asm/io_apic.h> #include <asm/apic.h> #include <asm/iommu.h> +#include <asm/gart.h> static void __init fix_hypertransport_config(int num, int slot, int func) { diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 34ad997d383..23b138e31e9 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -875,49 +875,6 @@ static struct console early_dbgp_console = { }; #endif -/* Console interface to a host file on AMD's SimNow! */ - -static int simnow_fd; - -enum { - MAGIC1 = 0xBACCD00A, - MAGIC2 = 0xCA110000, - XOPEN = 5, - XWRITE = 4, -}; - -static noinline long simnow(long cmd, long a, long b, long c) -{ - long ret; - - asm volatile("cpuid" : - "=a" (ret) : - "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); - return ret; -} - -static void __init simnow_init(char *str) -{ - char *fn = "klog"; - - if (*str == '=') - fn = ++str; - /* error ignored */ - simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); -} - -static void simnow_write(struct console *con, const char *s, unsigned n) -{ - simnow(XWRITE, simnow_fd, (unsigned long)s, n); -} - -static struct console simnow_console = { - .name = "simnow", - .write = simnow_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf) max_ypos = boot_params.screen_info.orig_video_lines; current_ypos = boot_params.screen_info.orig_y; early_console = &early_vga_console; - } else if (!strncmp(buf, "simnow", 6)) { - simnow_init(buf + 6); - early_console = &simnow_console; - keep_early = 1; #ifdef CONFIG_EARLY_PRINTK_DBGP } else if (!strncmp(buf, "dbgp", 4)) { if (early_dbgp_init(buf+4) < 0) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index fe7014176eb..d6f0490a739 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -954,6 +954,9 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -968,6 +971,11 @@ ftrace_call: popl %edx popl %ecx popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: @@ -977,8 +985,18 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif .globl ftrace_stub ftrace_stub: ret @@ -997,12 +1015,43 @@ trace: popl %edx popl %ecx popl %eax - jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif + .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fc013cfde30..1954a966220 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -67,16 +67,10 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -86,14 +80,13 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: @@ -102,15 +95,63 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpq $ftrace_stub, ftrace_trace_function jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + .globl ftrace_stub ftrace_stub: retq trace: - /* taken from glibc */ - subq $0x38, %rsp + MCOUNT_SAVE_FRAME + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + MCOUNT_RESTORE_FRAME + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + MCOUNT_SAVE_FRAME + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi + + call prepare_ftrace_return + + MCOUNT_RESTORE_FRAME + + retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: + subq $80, %rsp + movq %rax, (%rsp) movq %rcx, 8(%rsp) movq %rdx, 16(%rsp) @@ -118,13 +159,14 @@ trace: movq %rdi, 32(%rsp) movq %r8, 40(%rsp) movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + movq %r11, 64(%rsp) - movq 0x38(%rsp), %rdi - movq 8(%rbp), %rsi - subq $MCOUNT_INSN_SIZE, %rdi - - call *ftrace_trace_function + call ftrace_return_to_handler + movq %rax, 72(%rsp) + movq 64(%rsp), %r11 + movq 56(%rsp), %r10 movq 48(%rsp), %r9 movq 40(%rsp), %r8 movq 32(%rsp), %rdi @@ -132,12 +174,10 @@ trace: movq 16(%rsp), %rdx movq 8(%rsp), %rcx movq (%rsp), %rax - addq $0x38, %rsp + addq $72, %rsp + retq +#endif - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index 0aa2c443d60..53699c931ad 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -38,8 +38,11 @@ #include <asm/io.h> #include <asm/nmi.h> #include <asm/smp.h> +#include <asm/atomic.h> #include <asm/apicdef.h> #include <mach_mpparse.h> +#include <asm/genapic.h> +#include <asm/setup.h> /* * ES7000 chipsets @@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } +static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +{ + unsigned long vect = 0, psaival = 0; + + if (psai == NULL) + return -1; + + vect = ((unsigned long)__pa(eip)/0x1000) << 16; + psaival = (0x1000000 | vect | cpu); + + while (*psai & 0x1000000) + ; + + *psai = psaival; + + return 0; +} + +static void noop_wait_for_deassert(atomic_t *deassert_not_used) +{ +} + +static int __init es7000_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + + /* MPENTIUMIII */ + if (boot_cpu_data.x86 == 6 && + (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { + es7000_update_genapic_to_cluster(); + genapic->wait_for_init_deassert = noop_wait_for_deassert; + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + } + + return 0; +} + void __init setup_unisys(void) { @@ -176,6 +216,8 @@ setup_unisys(void) else es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; + + x86_quirks->update_genapic = es7000_update_genapic; } /* @@ -317,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg) return status; } -int -es7000_start_cpu(int cpu, unsigned long eip) -{ - unsigned long vect = 0, psaival = 0; - - if (psai == NULL) - return -1; - - vect = ((unsigned long)__pa(eip)/0x1000) << 16; - psaival = (0x1000000 | vect | cpu); - - while (*psai & 0x1000000) - ; - - *psai = psaival; - - return 0; - -} - void __init es7000_sw_apic(void) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 50ea0ac8c9b..1b43086b097 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -14,14 +14,17 @@ #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/percpu.h> +#include <linux/sched.h> #include <linux/init.h> #include <linux/list.h> #include <asm/ftrace.h> +#include <linux/ftrace.h> #include <asm/nops.h> +#include <asm/nmi.h> -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; +#ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -31,18 +34,12 @@ union ftrace_code_union { } __attribute__((packed)); }; - static int ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) -{ - return ftrace_nop; -} - -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } -int +/* + * Modifying code must take extra care. On an SMP machine, if + * the code being modified is also being executed on another CPU + * that CPU will have undefined results and possibly take a GPF. + * We use kstop_machine to stop other CPUS from exectuing code. + * But this does not stop NMIs from happening. We still need + * to protect against that. We separate out the modification of + * the code to take care of this. + * + * Two buffers are added: An IP buffer and a "code" buffer. + * + * 1) Put the instruction pointer into the IP buffer + * and the new code into the "code" buffer. + * 2) Set a flag that says we are modifying code + * 3) Wait for any running NMIs to finish. + * 4) Write the code + * 5) clear the flag. + * 6) Wait for any running NMIs to finish. + * + * If an NMI is executed, the first thing it does is to call + * "ftrace_nmi_enter". This will check if the flag is set to write + * and if it is, it will write what is in the IP and "code" buffers. + * + * The trick is, it does not matter if everyone is writing the same + * content to the code location. Also, if a CPU is executing code + * it is OK to write to that code location if the contents being written + * are the same as what exists. + */ + +static atomic_t in_nmi = ATOMIC_INIT(0); +static int mod_code_status; /* holds return value of text write */ +static int mod_code_write; /* set when NMI should do the write */ +static void *mod_code_ip; /* holds the IP to write to */ +static void *mod_code_newcode; /* holds the text to write to the IP */ + +static unsigned nmi_wait_count; +static atomic_t nmi_update_count = ATOMIC_INIT(0); + +int ftrace_arch_read_dyn_info(char *buf, int size) +{ + int r; + + r = snprintf(buf, size, "%u %u", + nmi_wait_count, + atomic_read(&nmi_update_count)); + return r; +} + +static void ftrace_mod_code(void) +{ + /* + * Yes, more than one CPU process can be writing to mod_code_status. + * (and the code itself) + * But if one were to fail, then they all should, and if one were + * to succeed, then they all should. + */ + mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, + MCOUNT_INSN_SIZE); +} + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); + /* Must have in_nmi seen before reading write flag */ + smp_mb(); + if (mod_code_write) { + ftrace_mod_code(); + atomic_inc(&nmi_update_count); + } +} + +void ftrace_nmi_exit(void) +{ + /* Finish all executions before clearing in_nmi */ + smp_wmb(); + atomic_dec(&in_nmi); +} + +static void wait_for_nmi(void) +{ + int waited = 0; + + while (atomic_read(&in_nmi)) { + waited = 1; + cpu_relax(); + } + + if (waited) + nmi_wait_count++; +} + +static int +do_ftrace_mod_code(unsigned long ip, void *new_code) +{ + mod_code_ip = (void *)ip; + mod_code_newcode = new_code; + + /* The buffers need to be visible before we let NMIs write them */ + smp_wmb(); + + mod_code_write = 1; + + /* Make sure write bit is visible before we wait on NMIs */ + smp_mb(); + + wait_for_nmi(); + + /* Make sure all running NMIs have finished before we write the code */ + smp_mb(); + + ftrace_mod_code(); + + /* Make sure the write happens before clearing the bit */ + smp_wmb(); + + mod_code_write = 0; + + /* make sure NMIs see the cleared bit */ + smp_mb(); + + wait_for_nmi(); + + return mod_code_status; +} + + + + +static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +static unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop; +} + +static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return -EINVAL; /* replace the text with the new text */ - if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) + if (do_ftrace_mod_code(ip, new_code)) return -EPERM; sync_core(); @@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return 0; } +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + return ftrace_modify_code(rec->ip, old, new); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + return ftrace_modify_code(rec->ip, old, new); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); @@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); + +static int ftrace_mod_jmp(unsigned long ip, + int old_offset, int new_offset) +{ + unsigned char code[MCOUNT_INSN_SIZE]; + + if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (code[0] != 0xe9 || old_offset != *(int *)(&code[1])) + return -EINVAL; + + *(int *)(&code[1]) = new_offset; + + if (do_ftrace_mod_code(ip, &code)) + return -EPERM; + + return 0; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +#else /* CONFIG_DYNAMIC_FTRACE */ + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} + +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func, int *depth) +{ + int index; + + if (!current->ret_stack) + return -EBUSY; + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); + return -EBUSY; + } + + index = ++current->curr_ret_stack; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = time; + *depth = index; + + return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +{ + int index; + + index = current->curr_ret_stack; + + if (unlikely(index < 0)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = index; + barrier(); + current->curr_ret_stack--; + +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + pop_return_trace(&trace, &ret); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_graph_return(&trace); + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + + return ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + struct ftrace_graph_ent trace; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (unlikely(atomic_read(&in_nmi))) + return; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: " _ASM_MOV " (%[parent_old]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE(2b, 3b) + + : [parent_replaced] "=r" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); + return; + } + + if (unlikely(!__kernel_text_address(old))) { + ftrace_graph_stop(); + *parent = old; + WARN_ON(1); + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, + self_addr, &trace.depth) == -EBUSY) { + *parent = old; + return; + } + + trace.func = self_addr; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + current->curr_ret_stack--; + *parent = old; + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 6c9bfc9e1e9..2bced78b0b8 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -21,6 +21,7 @@ #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> +#include <asm/setup.h> extern struct genapic apic_flat; extern struct genapic apic_physflat; @@ -53,6 +54,9 @@ void __init setup_apic_routing(void) genapic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } + + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 2c7dbdb9827..dece1728973 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/threads.h> +#include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/string.h> #include <linux/ctype.h> @@ -17,6 +18,9 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/hardirq.h> +#include <linux/timer.h> +#include <linux/proc_fs.h> +#include <asm/current.h> #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> @@ -356,6 +360,103 @@ static __init void uv_rtc_init(void) } /* + * percpu heartbeat timer + */ +static void uv_heartbeat(unsigned long ignored) +{ + struct timer_list *timer = &uv_hub_info->scir.timer; + unsigned char bits = uv_hub_info->scir.state; + + /* flip heartbeat bit */ + bits ^= SCIR_CPU_HEARTBEAT; + + /* is this cpu idle? */ + if (idle_cpu(raw_smp_processor_id())) + bits &= ~SCIR_CPU_ACTIVITY; + else + bits |= SCIR_CPU_ACTIVITY; + + /* update system controller interface reg */ + uv_set_scir_bits(bits); + + /* enable next timer period */ + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); +} + +static void __cpuinit uv_heartbeat_enable(int cpu) +{ + if (!uv_cpu_hub_info(cpu)->scir.enabled) { + struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); + setup_timer(timer, uv_heartbeat, cpu); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_hub_info(cpu)->scir.enabled = 1; + } + + /* check boot cpu */ + if (!uv_cpu_hub_info(0)->scir.enabled) + uv_heartbeat_enable(0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __cpuinit uv_heartbeat_disable(int cpu) +{ + if (uv_cpu_hub_info(cpu)->scir.enabled) { + uv_cpu_hub_info(cpu)->scir.enabled = 0; + del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + } + uv_set_cpu_scir_bits(cpu, 0xff); +} + +/* + * cpu hotplug notifier + */ +static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + uv_heartbeat_enable(cpu); + break; + case CPU_DOWN_PREPARE: + uv_heartbeat_disable(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static __init void uv_scir_register_cpu_notifier(void) +{ + hotcpu_notifier(uv_scir_cpu_notify, 0); +} + +#else /* !CONFIG_HOTPLUG_CPU */ + +static __init void uv_scir_register_cpu_notifier(void) +{ +} + +static __init int uv_init_heartbeat(void) +{ + int cpu; + + if (is_uv_system()) + for_each_online_cpu(cpu) + uv_heartbeat_enable(cpu); + return 0; +} + +late_initcall(uv_init_heartbeat); + +#endif /* !CONFIG_HOTPLUG_CPU */ + +/* * Called on each cpu to initialize the per_cpu UV data area. * ZZZ hotplug not supported yet */ @@ -428,7 +529,7 @@ void __init uv_system_init(void) uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &uv_coherency_id, &uv_region_size); + &sn_coherency_id, &sn_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@ -439,8 +540,7 @@ void __init uv_system_init(void) uv_blade_info[blade].nr_possible_cpus++; uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = - lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; @@ -450,7 +550,8 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; + uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; + uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); @@ -467,4 +568,6 @@ void __init uv_system_init(void) map_mmioh_high(max_pnode); uv_cpu_init(); + uv_scir_register_cpu_notifier(); + proc_mkdir("sgi_uv", NULL); } diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 1dcb0f13897..3e66bd364a9 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -35,7 +35,6 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); - printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index fa1d25dd83e..ac108d1fe18 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -12,9 +12,12 @@ #include <asm/sections.h> #include <asm/e820.h> #include <asm/bios_ebda.h> +#include <asm/trampoline.h> void __init i386_start_kernel(void) { + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index d16084f9064..388e05a5fc1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,6 +24,7 @@ #include <asm/kdebug.h> #include <asm/e820.h> #include <asm/bios_ebda.h> +#include <asm/trampoline.h> /* boot cpu pda */ static struct x8664_pda _boot_cpu_pda __read_mostly; @@ -120,6 +121,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 067d8de913f..3f0a3edf0a5 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,7 +33,9 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; -unsigned long hpet_num_timers; +#ifdef CONFIG_PCI_MSI +static unsigned long hpet_num_timers; +#endif static void __iomem *hpet_virt_address; struct hpet_dev { diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index a4f93b4120c..d39918076bb 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -14,7 +14,6 @@ static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ /* * Initial thread structure. diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9043251210f..679e7bbbbcd 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2216,10 +2216,9 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); me = smp_processor_id(); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1d3d0e71b04..1df869e5bd0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -13,6 +13,7 @@ #include <linux/seq_file.h> #include <linux/module.h> #include <linux/delay.h> +#include <linux/ftrace.h> #include <asm/uaccess.h> #include <asm/io_apic.h> #include <asm/idle.h> @@ -45,7 +46,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 7a385746509..37f420018a4 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -13,6 +13,7 @@ #include <linux/numa.h> #include <linux/ftrace.h> #include <linux/suspend.h> +#include <linux/gfp.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -25,15 +26,6 @@ #include <asm/system.h> #include <asm/cacheflush.h> -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -static u32 kexec_pgd[1024] PAGE_ALIGNED; -#ifdef CONFIG_X86_PAE -static u32 kexec_pmd0[1024] PAGE_ALIGNED; -static u32 kexec_pmd1[1024] PAGE_ALIGNED; -#endif -static u32 kexec_pte0[1024] PAGE_ALIGNED; -static u32 kexec_pte1[1024] PAGE_ALIGNED; - static void set_idt(void *newidt, __u16 limit) { struct desc_ptr curidt; @@ -76,6 +68,76 @@ static void load_segments(void) #undef __STR } +static void machine_kexec_free_page_tables(struct kimage *image) +{ + free_page((unsigned long)image->arch.pgd); +#ifdef CONFIG_X86_PAE + free_page((unsigned long)image->arch.pmd0); + free_page((unsigned long)image->arch.pmd1); +#endif + free_page((unsigned long)image->arch.pte0); + free_page((unsigned long)image->arch.pte1); +} + +static int machine_kexec_alloc_page_tables(struct kimage *image) +{ + image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); +#ifdef CONFIG_X86_PAE + image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); +#endif + image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!image->arch.pgd || +#ifdef CONFIG_X86_PAE + !image->arch.pmd0 || !image->arch.pmd1 || +#endif + !image->arch.pte0 || !image->arch.pte1) { + machine_kexec_free_page_tables(image); + return -ENOMEM; + } + return 0; +} + +static void machine_kexec_page_table_set_one( + pgd_t *pgd, pmd_t *pmd, pte_t *pte, + unsigned long vaddr, unsigned long paddr) +{ + pud_t *pud; + + pgd += pgd_index(vaddr); +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); +#endif + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); + pte = pte_offset_kernel(pmd, vaddr); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +static void machine_kexec_prepare_page_tables(struct kimage *image) +{ + void *control_page; + pmd_t *pmd = 0; + + control_page = page_address(image->control_code_page); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd0; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte0, + (unsigned long)control_page, __pa(control_page)); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd1; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte1, + __pa(control_page), __pa(control_page)); +} + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -87,12 +149,20 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Make control page executable. + * - Make control page executable. + * - Allocate page tables + * - Setup page tables */ int machine_kexec_prepare(struct kimage *image) { + int error; + if (nx_enabled) set_pages_x(image->control_code_page, 1); + error = machine_kexec_alloc_page_tables(image); + if (error) + return error; + machine_kexec_prepare_page_tables(image); return 0; } @@ -104,6 +174,7 @@ void machine_kexec_cleanup(struct kimage *image) { if (nx_enabled) set_pages_nx(image->control_code_page, 1); + machine_kexec_free_page_tables(image); } /* @@ -150,18 +221,7 @@ void machine_kexec(struct kimage *image) relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_PGD] = __pa(kexec_pgd); - page_list[VA_PGD] = (unsigned long)kexec_pgd; -#ifdef CONFIG_X86_PAE - page_list[PA_PMD_0] = __pa(kexec_pmd0); - page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PMD_1] = __pa(kexec_pmd1); - page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; -#endif - page_list[PA_PTE_0] = __pa(kexec_pte0); - page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PTE_1] = __pa(kexec_pte1); - page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_PGD] = __pa(image->arch.pgd); if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 5f8e5d75a25..c25fdb38229 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -10,7 +10,7 @@ * This driver allows to upgrade microcode on AMD * family 0x10 and 0x11 processors. * - * Licensed unter the terms of the GNU General Public + * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ @@ -32,9 +32,9 @@ #include <linux/platform_device.h> #include <linux/pci.h> #include <linux/pci_ids.h> +#include <linux/uaccess.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/processor.h> #include <asm/microcode.h> @@ -47,43 +47,38 @@ MODULE_LICENSE("GPL v2"); #define UCODE_UCODE_TYPE 0x00000001 struct equiv_cpu_entry { - unsigned int installed_cpu; - unsigned int fixed_errata_mask; - unsigned int fixed_errata_compare; - unsigned int equiv_cpu; -}; + u32 installed_cpu; + u32 fixed_errata_mask; + u32 fixed_errata_compare; + u16 equiv_cpu; + u16 res; +} __attribute__((packed)); struct microcode_header_amd { - unsigned int data_code; - unsigned int patch_id; - unsigned char mc_patch_data_id[2]; - unsigned char mc_patch_data_len; - unsigned char init_flag; - unsigned int mc_patch_data_checksum; - unsigned int nb_dev_id; - unsigned int sb_dev_id; - unsigned char processor_rev_id[2]; - unsigned char nb_rev_id; - unsigned char sb_rev_id; - unsigned char bios_api_rev; - unsigned char reserved1[3]; - unsigned int match_reg[8]; -}; + u32 data_code; + u32 patch_id; + u16 mc_patch_data_id; + u8 mc_patch_data_len; + u8 init_flag; + u32 mc_patch_data_checksum; + u32 nb_dev_id; + u32 sb_dev_id; + u16 processor_rev_id; + u8 nb_rev_id; + u8 sb_rev_id; + u8 bios_api_rev; + u8 reserved1[3]; + u32 match_reg[8]; +} __attribute__((packed)); struct microcode_amd { struct microcode_header_amd hdr; unsigned int mpb[0]; }; -#define UCODE_MAX_SIZE (2048) -#define DEFAULT_UCODE_DATASIZE (896) -#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define DWSIZE (sizeof(u32)) -/* For now we support a fixed ucode total size only */ -#define get_totalsize(mc) \ - ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ - + MC_HEADER_SIZE) +#define UCODE_MAX_SIZE 2048 +#define UCODE_CONTAINER_SECTION_HDR 8 +#define UCODE_CONTAINER_HEADER_SIZE 12 /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); @@ -93,31 +88,24 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); + u32 dummy; memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", - cpu); + printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); return -1; } - - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (csig->rev) - : "i" (0x0000008B) : "ecx"); - - printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", - csig->rev); - + rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); + printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } static int get_matching_microcode(int cpu, void *mc, int rev) { struct microcode_header_amd *mc_header = mc; - struct pci_dev *nb_pci_dev, *sb_pci_dev; unsigned int current_cpu_id; - unsigned int equiv_cpu_id = 0x00; + u16 equiv_cpu_id = 0; unsigned int i = 0; BUG_ON(equiv_cpu_table == NULL); @@ -132,57 +120,25 @@ static int get_matching_microcode(int cpu, void *mc, int rev) } if (!equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d cpu_id " - "not found in equivalent cpu table \n", cpu); + printk(KERN_WARNING "microcode: CPU%d: cpu revision " + "not listed in equivalent cpu table\n", cpu); return 0; } - if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { - printk(KERN_ERR - "microcode: CPU%d patch does not match " - "(patch is %x, cpu extended is %x) \n", - cpu, mc_header->processor_rev_id[0], - (equiv_cpu_id & 0xff)); + if (mc_header->processor_rev_id != equiv_cpu_id) { + printk(KERN_ERR "microcode: CPU%d: patch mismatch " + "(processor_rev_id: %x, equiv_cpu_id: %x)\n", + cpu, mc_header->processor_rev_id, equiv_cpu_id); return 0; } - if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { - printk(KERN_ERR "microcode: CPU%d patch does not match " - "(patch is %x, cpu base id is %x) \n", - cpu, mc_header->processor_rev_id[1], - ((equiv_cpu_id >> 16) & 0xff)); - + /* ucode might be chipset specific -- currently we don't support this */ + if (mc_header->nb_dev_id || mc_header->sb_dev_id) { + printk(KERN_ERR "microcode: CPU%d: loading of chipset " + "specific code not yet supported\n", cpu); return 0; } - /* ucode may be northbridge specific */ - if (mc_header->nb_dev_id) { - nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, - (mc_header->nb_dev_id & 0xff), - NULL); - if ((!nb_pci_dev) || - (mc_header->nb_rev_id != nb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu); - pci_dev_put(nb_pci_dev); - return 0; - } - pci_dev_put(nb_pci_dev); - } - - /* ucode may be southbridge specific */ - if (mc_header->sb_dev_id) { - sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, - (mc_header->sb_dev_id & 0xff), - NULL); - if ((!sb_pci_dev) || - (mc_header->sb_rev_id != sb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu); - pci_dev_put(sb_pci_dev); - return 0; - } - pci_dev_put(sb_pci_dev); - } - if (mc_header->patch_id <= rev) return 0; @@ -192,12 +148,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev) static void apply_microcode_amd(int cpu) { unsigned long flags; - unsigned int eax, edx; - unsigned int rev; + u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; struct microcode_amd *mc_amd = uci->mc; - unsigned long addr; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -206,42 +160,34 @@ static void apply_microcode_amd(int cpu) return; spin_lock_irqsave(µcode_update_lock, flags); - - addr = (unsigned long)&mc_amd->hdr.data_code; - edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); - eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); - - asm volatile("movl %0, %%ecx; wrmsr" : - : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); - + wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (rev) - : "i" (0x0000008B) : "ecx"); - + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, - mc_amd->hdr.patch_id, rev); + printk(KERN_ERR "microcode: CPU%d: update failed " + "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); return; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x \n", - cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); + printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", + cpu, rev); uci->cpu_sig.rev = rev; } -static void * get_next_ucode(u8 *buf, unsigned int size, - int (*get_ucode_data)(void *, const void *, size_t), - unsigned int *mc_size) +static int get_ucode_data(void *to, const u8 *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static void *get_next_ucode(const u8 *buf, unsigned int size, + unsigned int *mc_size) { unsigned int total_size; -#define UCODE_CONTAINER_SECTION_HDR 8 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; void *mc; @@ -249,39 +195,37 @@ static void * get_next_ucode(u8 *buf, unsigned int size, return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - printk(KERN_ERR "microcode: error! " - "Wrong microcode payload type field\n"); + printk(KERN_ERR "microcode: error: invalid type field in " + "container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - printk(KERN_INFO "microcode: size %u, total_size %u\n", - size, total_size); + printk(KERN_DEBUG "microcode: size %u, total_size %u\n", + size, total_size); if (total_size > size || total_size > UCODE_MAX_SIZE) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + printk(KERN_ERR "microcode: error: size mismatch\n"); return NULL; } mc = vmalloc(UCODE_MAX_SIZE); if (mc) { memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { + if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, + total_size)) { vfree(mc); mc = NULL; } else *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; } -#undef UCODE_CONTAINER_SECTION_HDR return mc; } -static int install_equiv_cpu_table(u8 *buf, - int (*get_ucode_data)(void *, const void *, size_t)) +static int install_equiv_cpu_table(const u8 *buf) { -#define UCODE_CONTAINER_HEADER_SIZE 12 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; unsigned int *buf_pos = (unsigned int *)container_hdr; unsigned long size; @@ -292,14 +236,15 @@ static int install_equiv_cpu_table(u8 *buf, size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - printk(KERN_ERR "microcode: error! " - "Wrong microcode equivalnet cpu table\n"); + printk(KERN_ERR "microcode: error: invalid type field in " + "container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); + printk(KERN_ERR "microcode: failed to allocate " + "equivalent CPU table\n"); return 0; } @@ -310,7 +255,6 @@ static int install_equiv_cpu_table(u8 *buf, } return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ -#undef UCODE_CONTAINER_HEADER_SIZE } static void free_equiv_cpu_table(void) @@ -321,18 +265,20 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static int generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc; + const u8 *ucode_ptr = data; + void *new_mc = NULL; + void *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; - offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); + offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); + printk(KERN_ERR "microcode: failed to create " + "equivalent cpu table\n"); return -EINVAL; } @@ -343,7 +289,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; - mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); + mc = get_next_ucode(ucode_ptr, leftover, &mc_size); if (!mc) break; @@ -353,7 +299,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; - } else + } else vfree(mc); ucode_ptr += mc_size; @@ -365,9 +311,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size, if (uci->mc) vfree(uci->mc); uci->mc = new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("microcode: CPU%d found a matching microcode " + "update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); } else vfree(new_mc); } @@ -377,12 +323,6 @@ static int generic_load_microcode(int cpu, void *data, size_t size, return (int)leftover; } -static int get_ucode_fw(void *to, const void *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static int request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; @@ -394,12 +334,11 @@ static int request_microcode_fw(int cpu, struct device *device) ret = request_firmware(&firmware, fw_name, device); if (ret) { - printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); return ret; } - ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, - &get_ucode_fw); + ret = generic_load_microcode(cpu, firmware->data, firmware->size); release_firmware(firmware); @@ -408,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device) static int request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" - "is not supported\n"); + printk(KERN_INFO "microcode: AMD microcode update via " + "/dev/cpu/microcode not supported\n"); return -1; } @@ -433,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void) { return µcode_amd_ops; } + diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 82fb2809ce3..c9b721ba968 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -99,7 +99,7 @@ MODULE_LICENSE("GPL"); #define MICROCODE_VERSION "2.00" -struct microcode_ops *microcode_ops; +static struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); @@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #endif /* fake device for request_firmware */ -struct platform_device *microcode_pdev; +static struct platform_device *microcode_pdev; static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, @@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = { .name = "microcode", }; -static void microcode_fini_cpu(int cpu) +static void __microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - mutex_lock(µcode_mutex); microcode_ops->microcode_fini_cpu(cpu); uci->valid = 0; +} + +static void microcode_fini_cpu(int cpu) +{ + mutex_lock(µcode_mutex); + __microcode_fini_cpu(cpu); mutex_unlock(µcode_mutex); } @@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu) * to this cpu (a bit of paranoia): */ if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - microcode_fini_cpu(cpu); + __microcode_fini_cpu(cpu); + printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", + cpu); return -1; } - if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { - microcode_fini_cpu(cpu); + if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { + __microcode_fini_cpu(cpu); + printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", + cpu); /* Should we look for a new ucode here? */ return 1; } @@ -319,7 +328,7 @@ static int microcode_resume_cpu(int cpu) return 0; } -void microcode_update_cpu(int cpu) +static void microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 622dc4a2178..b7f4c929e61 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock); static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } + /* serialize access to the physical write to MSR 0x79 */ + spin_lock_irqsave(µcode_update_lock, flags); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); + spin_unlock_irqrestore(µcode_update_lock, flags); + pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", csig->sig, csig->pf, csig->rev); @@ -465,7 +471,7 @@ static void microcode_fini_cpu(int cpu) uci->mc = NULL; } -struct microcode_ops microcode_intel_ops = { +static struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0f4c1fd5a1f..45e3b69808b 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -586,26 +586,23 @@ static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } + if (!mpf) + return; + if (acpi_lapic && early) return; + /* - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " - "information\n"); + if (acpi_lapic && acpi_ioapic) return; - } else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) " - "configuration information\n"); - if (!mpf) - return; + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) + return; + } printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2c97f07f1c2..8bd1bf9622a 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -131,6 +131,11 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count) atomic_dec(&nmi_active); } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -179,8 +184,12 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; error: - if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) - disable_8259A_irq(0); + if (nmi_watchdog == NMI_IO_APIC) { + if (!timer_through_8259) + disable_8259A_irq(0); + on_each_cpu(__acpi_nmi_disable, NULL, 1); + } + #ifdef CONFIG_X86_32 timer_ack = 0; #endif @@ -199,12 +208,17 @@ static int __init setup_nmi_watchdog(char *str) ++str; } - get_option(&str, &nmi); - - if (nmi >= NMI_INVALID) - return 0; + if (!strncmp(str, "lapic", 5)) + nmi_watchdog = NMI_LOCAL_APIC; + else if (!strncmp(str, "ioapic", 6)) + nmi_watchdog = NMI_IO_APIC; + else { + get_option(&str, &nmi); + if (nmi >= NMI_INVALID) + return 0; + nmi_watchdog = nmi; + } - nmi_watchdog = nmi; return 1; } __setup("nmi_watchdog=", setup_nmi_watchdog); @@ -285,11 +299,6 @@ void acpi_nmi_enable(void) on_each_cpu(__acpi_nmi_enable, NULL, 1); } -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - /* * Disable timer based NMIs on all CPUs: */ @@ -340,6 +349,8 @@ void stop_apic_nmi_watchdog(void *unused) return; if (nmi_watchdog == NMI_LOCAL_APIC) lapic_watchdog_stop(); + else + __acpi_nmi_disable(NULL); __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -465,6 +476,24 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) #ifdef CONFIG_SYSCTL +static void enable_ioapic_nmi_watchdog_single(void *unused) +{ + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); + __acpi_nmi_enable(NULL); +} + +static void enable_ioapic_nmi_watchdog(void) +{ + on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); + touch_nmi_watchdog(); +} + +static void disable_ioapic_nmi_watchdog(void) +{ + on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); +} + static int __init setup_unknown_nmi_panic(char *str) { unknown_nmi_panic = 1; @@ -507,6 +536,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, enable_lapic_nmi_watchdog(); else disable_lapic_nmi_watchdog(); + } else if (nmi_watchdog == NMI_IO_APIC) { + if (nmi_watchdog_enabled) + enable_ioapic_nmi_watchdog(); + else + disable_ioapic_nmi_watchdog(); } else { printk(KERN_WARNING "NMI watchdog doesn't know what hardware to touch\n"); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 4caff39078e..0deea37a53c 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -31,7 +31,7 @@ #include <asm/numaq.h> #include <asm/topology.h> #include <asm/processor.h> -#include <asm/mpspec.h> +#include <asm/genapic.h> #include <asm/e820.h> #include <asm/setup.h> @@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void) return 1; } +static int __init numaq_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; + + return 0; +} + static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, .setup_ioapic_ids = numaq_setup_ioapic_ids, + .update_genapic = numaq_update_genapic, }; void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 19262482021..7a3dfceb90e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -6,6 +6,7 @@ #include <asm/proto.h> #include <asm/dma.h> #include <asm/iommu.h> +#include <asm/gart.h> #include <asm/calgary.h> #include <asm/amd_iommu.h> @@ -30,11 +31,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_bio_merge); - dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -188,7 +184,6 @@ static __init int iommu_setup(char *p) } if (!strncmp(p, "biomerge", 8)) { - iommu_bio_merge = 4096; iommu_merge = 1; force_iommu = 1; } @@ -300,8 +295,8 @@ fs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected." - "Disabling DAC.\n"); + printk(KERN_INFO + "PCI: VIA PCI bridge detected. Disabling DAC.\n"); forbid_dac = 1; } } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ba7ad83e20a..a35eaa379ff 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -745,10 +745,8 @@ void __init gart_iommu_init(void) unsigned long scratch; long i; - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { - printk(KERN_INFO "PCI-GART: No AMD GART found.\n"); + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) return; - } #ifndef CONFIG_AGP_AMD64 no_agp = 1; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d..e68bb9e3086 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,13 +1,16 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <asm/idle.h> #include <linux/smp.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/pm.h> #include <linux/clockchips.h> +#include <linux/ftrace.h> #include <asm/system.h> +#include <asm/apic.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -100,6 +103,9 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -112,6 +118,7 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(&it); } else { local_irq_enable(); /* loop is done by the caller */ @@ -122,6 +129,21 @@ void default_idle(void) EXPORT_SYMBOL(default_idle); #endif +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + disable_local_APIC(); + + for (;;) { + if (hlt_works(smp_processor_id())) + halt(); + } +} + static void do_nothing(void *unused) { } @@ -154,24 +176,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(ax, cx); } + trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { + struct power_trace it; if (!need_resched()) { + trace_power_start(&it, POWER_CSTATE, 1); __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(&it); } else local_irq_enable(); } @@ -183,9 +212,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); + trace_power_end(&it); } /* @@ -270,7 +303,7 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0a1302fe6d4..3ba155d2488 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@ #include <linux/percpu.h> #include <linux/prctl.h> #include <linux/dmi.h> +#include <linux/ftrace.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -59,6 +60,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/smp.h> +#include <asm/ds.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -250,14 +252,8 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(current->thread.ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(current->thread.ds_ctx); - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -339,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } @@ -419,48 +421,19 @@ int set_tsc_mode(unsigned int val) return 0; } -#ifdef CONFIG_X86_DS -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - unsigned long ds_prev = 0; - unsigned long ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, ds_next, 0); - } - return debugctl; -} -#else -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - return debugctl; -} -#endif /* CONFIG_X86_DS */ - static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread; next = &next_p->thread; - debugctl = update_debugctl(prev, next, prev->debugctlmsr); - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -482,15 +455,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ - - if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -548,7 +512,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c958120fb1b..416fb9282f4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -39,6 +39,7 @@ #include <linux/prctl.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/ftrace.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -52,6 +53,7 @@ #include <asm/ia32.h> #include <asm/idle.h> #include <asm/syscalls.h> +#include <asm/ds.h> asmlinkage extern void ret_from_fork(void); @@ -235,14 +237,8 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(t->ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(t->ds_ctx); - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -372,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (err) goto out; } + + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { @@ -470,35 +472,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; - debugctl = prev->debugctlmsr; - -#ifdef CONFIG_X86_DS - { - unsigned long ds_prev = 0, ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* - * We clear debugctl to make sure DS - * is not in use when we change it: - */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, ds_next); - } - } -#endif /* CONFIG_X86_DS */ - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -533,14 +514,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } - -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* @@ -551,8 +524,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. */ -struct task_struct * +__notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a6d8c12e10..0a5df5f82fb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -581,158 +581,91 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS -/* - * The configuration for a particular BTS hardware implementation. - */ -struct bts_configuration { - /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ - unsigned char sizeof_bts; - /* the size of a field in the BTS record in bytes */ - unsigned char sizeof_field; - /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ - unsigned long debugctl_mask; -}; -static struct bts_configuration bts_cfg; - -#define BTS_MAX_RECORD_SIZE (8 * 3) - - -/* - * Branch Trace Store (BTS) uses the following format. Different - * architectures vary in the size of those fields. - * - source linear address - * - destination linear address - * - flags - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * We compute the base address for the first 8 fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - * In order to store additional information in the BTS buffer, we use - * a special source address to indicate that the record requires - * special interpretation. - * - * Netburst indicated via a bit in the flags field whether the branch - * was predicted; this is ignored. - */ - -enum bts_field { - bts_from = 0, - bts_to, - bts_flags, - - bts_escape = (unsigned long)-1, - bts_qual = bts_to, - bts_jiffies = bts_flags -}; - -static inline unsigned long bts_get(const char *base, enum bts_field field) -{ - base += (bts_cfg.sizeof_field * field); - return *(unsigned long *)base; -} - -static inline void bts_set(char *base, enum bts_field field, unsigned long val) -{ - base += (bts_cfg.sizeof_field * field);; - (*(unsigned long *)base) = val; -} - -/* - * Translate a BTS record from the raw format into the bts_struct format - * - * out (out): bts_struct interpretation - * raw: raw BTS record - */ -static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) -{ - memset(out, 0, sizeof(*out)); - if (bts_get(raw, bts_from) == bts_escape) { - out->qualifier = bts_get(raw, bts_qual); - out->variant.jiffies = bts_get(raw, bts_jiffies); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = bts_get(raw, bts_from); - out->variant.lbr.to_ip = bts_get(raw, bts_to); - } -} - static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { - struct bts_struct ret; - const void *bts_record; - size_t bts_index, bts_end; + const struct bts_trace *trace; + struct bts_struct bts; + const unsigned char *at; int error; - error = ds_get_bts_end(child, &bts_end); - if (error < 0) - return error; - - if (bts_end <= index) - return -EINVAL; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - error = ds_get_bts_index(child, &bts_index); - if (error < 0) - return error; + at = trace->ds.top - ((index + 1) * trace->ds.size); + if ((void *)at < trace->ds.begin) + at += (trace->ds.n * trace->ds.size); - /* translate the ptrace bts index into the ds bts index */ - bts_index += bts_end - (index + 1); - if (bts_end <= bts_index) - bts_index -= bts_end; + if (!trace->read) + return -EOPNOTSUPP; - error = ds_access_bts(child, bts_index, &bts_record); + error = trace->read(child->bts, at, &bts); if (error < 0) return error; - ptrace_bts_translate_record(&ret, bts_record); - - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; - return sizeof(ret); + return sizeof(bts); } static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - struct bts_struct ret; - const unsigned char *raw; - size_t end, i; - int error; + const struct bts_trace *trace; + const unsigned char *at; + int error, drained = 0; - error = ds_get_bts_index(child, &end); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - if (size < (end * sizeof(struct bts_struct))) + if (!trace->read) + return -EOPNOTSUPP; + + if (size < (trace->ds.top - trace->ds.begin)) return -EIO; - error = ds_access_bts(child, 0, (const void **)&raw); - if (error < 0) - return error; + for (at = trace->ds.begin; (void *)at < trace->ds.top; + out++, drained++, at += trace->ds.size) { + struct bts_struct bts; + int error; - for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { - ptrace_bts_translate_record(&ret, raw); + error = trace->read(child->bts, at, &bts); + if (error < 0) + return error; - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; } - error = ds_clear_bts(child); + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); + + error = ds_reset_bts(child->bts); if (error < 0) return error; - return end; + return drained; } -static void ptrace_bts_ovfl(struct task_struct *child) +static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) { - send_sig(child->thread.bts_ovfl_signal, child, 0); + child->bts_buffer = alloc_locked_buffer(size); + if (!child->bts_buffer) + return -ENOMEM; + + child->bts_size = size; + + return 0; +} + +static void ptrace_bts_free_buffer(struct task_struct *child) +{ + free_locked_buffer(child->bts_buffer, child->bts_size); + child->bts_buffer = NULL; + child->bts_size = 0; } static int ptrace_bts_config(struct task_struct *child, @@ -740,114 +673,86 @@ static int ptrace_bts_config(struct task_struct *child, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int error = 0; - - error = -EOPNOTSUPP; - if (!bts_cfg.sizeof_bts) - goto errout; + unsigned int flags = 0; - error = -EIO; if (cfg_size < sizeof(cfg)) - goto errout; + return -EIO; - error = -EFAULT; if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - goto errout; + return -EFAULT; - error = -EINVAL; - if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && - !(cfg.flags & PTRACE_BTS_O_ALLOC)) - goto errout; + if (child->bts) { + ds_release_bts(child->bts); + child->bts = NULL; + } - if (cfg.flags & PTRACE_BTS_O_ALLOC) { - ds_ovfl_callback_t ovfl = NULL; - unsigned int sig = 0; + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + return -EINVAL; - /* we ignore the error in case we were not tracing child */ - (void)ds_release_bts(child); + return -EOPNOTSUPP; - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { - if (!cfg.signal) - goto errout; + child->thread.bts_ovfl_signal = cfg.signal; + } - sig = cfg.signal; - ovfl = ptrace_bts_ovfl; - } + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && + (cfg.size != child->bts_size)) { + int error; - error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); - if (error < 0) - goto errout; + ptrace_bts_free_buffer(child); - child->thread.bts_ovfl_signal = sig; + error = ptrace_bts_allocate_buffer(child, cfg.size); + if (error < 0) + return error; } - error = -EINVAL; - if (!child->thread.ds_ctx && cfg.flags) - goto errout; - if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= bts_cfg.debugctl_mask; - else - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + flags |= BTS_USER; if (cfg.flags & PTRACE_BTS_O_SCHED) - set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - else - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + flags |= BTS_TIMESTAMPS; - error = sizeof(cfg); + child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, + /* ovfl = */ NULL, /* th = */ (size_t)-1, + flags); + if (IS_ERR(child->bts)) { + int error = PTR_ERR(child->bts); -out: - if (child->thread.debugctlmsr) - set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + ptrace_bts_free_buffer(child); + child->bts = NULL; - return error; + return error; + } -errout: - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - goto out; + return sizeof(cfg); } static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + const struct bts_trace *trace; struct ptrace_bts_config cfg; - size_t end; - const void *base, *max; - int error; if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child, &end); - if (error < 0) - return error; - - error = ds_access_bts(child, /* index = */ 0, &base); - if (error < 0) - return error; - - error = ds_access_bts(child, /* index = */ end, &max); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; memset(&cfg, 0, sizeof(cfg)); - cfg.size = (max - base); + cfg.size = trace->ds.end - trace->ds.begin; cfg.signal = child->thread.bts_ovfl_signal; cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & bts_cfg.debugctl_mask) + if (trace->ds.flags & BTS_USER) cfg.flags |= PTRACE_BTS_O_TRACE; - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + if (trace->ds.flags & BTS_TIMESTAMPS) cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) @@ -856,110 +761,77 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } -static int ptrace_bts_write_record(struct task_struct *child, - const struct bts_struct *in) +static int ptrace_bts_clear(struct task_struct *child) { - unsigned char bts_record[BTS_MAX_RECORD_SIZE]; + const struct bts_trace *trace; - BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - memset(bts_record, 0, bts_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - case BTS_BRANCH: - bts_set(bts_record, bts_from, in->variant.lbr.from_ip); - bts_set(bts_record, bts_to, in->variant.lbr.to_ip); - break; + return ds_reset_bts(child->bts); +} - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - bts_set(bts_record, bts_from, bts_escape); - bts_set(bts_record, bts_qual, in->qualifier); - bts_set(bts_record, bts_jiffies, in->variant.jiffies); - break; +static int ptrace_bts_size(struct task_struct *child) +{ + const struct bts_trace *trace; - default: - return -EINVAL; - } + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - /* The writing task will be the switched-to task on a context - * switch. It needs to write into the switched-from task's BTS - * buffer. */ - return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); + return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -void ptrace_bts_take_timestamp(struct task_struct *tsk, - enum bts_qualifier qualifier) +static void ptrace_bts_fork(struct task_struct *tsk) { - struct bts_struct rec = { - .qualifier = qualifier, - .variant.jiffies = jiffies_64 - }; - - ptrace_bts_write_record(tsk, &rec); + tsk->bts = NULL; + tsk->bts_buffer = NULL; + tsk->bts_size = 0; + tsk->thread.bts_ovfl_signal = 0; } -static const struct bts_configuration bts_cfg_netburst = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<2)|(1<<3)|(1<<5) -}; +static void ptrace_bts_untrace(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + /* We cannot update total_vm and locked_vm since + child's mm is already gone. But we can reclaim the + memory. */ + kfree(child->bts_buffer); + child->bts_buffer = NULL; + child->bts_size = 0; + } +} -static const struct bts_configuration bts_cfg_pentium_m = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<6)|(1<<7) -}; +static void ptrace_bts_detach(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; -static const struct bts_configuration bts_cfg_core2 = { - .sizeof_bts = 8 * 3, - .sizeof_field = 8, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) -}; + ptrace_bts_free_buffer(child); + } +} +#else +static inline void ptrace_bts_fork(struct task_struct *tsk) {} +static inline void ptrace_bts_detach(struct task_struct *child) {} +static inline void ptrace_bts_untrace(struct task_struct *child) {} +#endif /* CONFIG_X86_PTRACE_BTS */ -static inline void bts_configure(const struct bts_configuration *cfg) +void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) { - bts_cfg = *cfg; + ptrace_bts_fork(child); } -void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) +void x86_ptrace_untrace(struct task_struct *child) { - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0xD: - case 0xE: /* Pentium M */ - bts_configure(&bts_cfg_pentium_m); - break; - case 0xF: /* Core2 */ - case 0x1C: /* Atom */ - bts_configure(&bts_cfg_core2); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - case 0xF: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - bts_configure(&bts_cfg_netburst); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - default: - /* sorry, don't know about them */ - break; - } + ptrace_bts_untrace(child); } -#endif /* CONFIG_X86_PTRACE_BTS */ /* * Called by kernel/ptrace.c when detaching.. @@ -972,15 +844,7 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif -#ifdef CONFIG_X86_PTRACE_BTS - (void)ds_release_bts(child); - - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); -#endif /* CONFIG_X86_PTRACE_BTS */ + ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -1112,7 +976,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_SIZE: - ret = ds_get_bts_index(child, /* pos = */ NULL); + ret = ptrace_bts_size(child); break; case PTRACE_BTS_GET: @@ -1121,7 +985,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child); + ret = ptrace_bts_clear(child); break; case PTRACE_BTS_DRAIN: @@ -1384,6 +1248,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: +#ifdef CONFIG_X86_PTRACE_BTS + case PTRACE_BTS_CONFIG: + case PTRACE_BTS_STATUS: + case PTRACE_BTS_SIZE: + case PTRACE_BTS_GET: + case PTRACE_BTS_CLEAR: + case PTRACE_BTS_DRAIN: +#endif /* CONFIG_X86_PTRACE_BTS */ return arch_ptrace(child, request, addr, data); default: diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index cc5a2545dd4..61f718df6ee 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -21,6 +21,9 @@ # include <asm/iommu.h> #endif +#include <mach_ipi.h> + + /* * Power off function, if any */ @@ -36,7 +39,10 @@ int reboot_force; static int reboot_cpu = -1; #endif -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] +/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ +bool port_cf9_safe = false; + +/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] warm Don't set the cold reboot flag cold Set the cold reboot flag bios Reboot by jumping through the BIOS (only for X86_32) @@ -45,6 +51,7 @@ static int reboot_cpu = -1; kbd Use the keyboard controller. cold reset (default) acpi Use the RESET_REG in the FADT efi Use efi reset_system runtime service + pci Use the so-called "PCI reset register", CF9 force Avoid anything that could hang. */ static int __init reboot_setup(char *str) @@ -79,6 +86,7 @@ static int __init reboot_setup(char *str) case 'k': case 't': case 'e': + case 'p': reboot_type = *str; break; @@ -404,12 +412,27 @@ static void native_machine_emergency_restart(void) reboot_type = BOOT_KBD; break; - case BOOT_EFI: if (efi_enabled) - efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, + efi.reset_system(reboot_mode ? + EFI_RESET_WARM : + EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); + reboot_type = BOOT_KBD; + break; + case BOOT_CF9: + port_cf9_safe = true; + /* fall through */ + + case BOOT_CF9_COND: + if (port_cf9_safe) { + u8 cf9 = inb(0xcf9) & ~6; + outb(cf9|2, 0xcf9); /* Request hard reset */ + udelay(50); + outb(cf9|6, 0xcf9); /* Actually do the reset */ + udelay(50); + } reboot_type = BOOT_KBD; break; } @@ -470,6 +493,11 @@ static void native_machine_restart(char *__unused) static void native_machine_halt(void) { + /* stop other cpus and apics */ + machine_shutdown(); + + /* stop this cpu */ + stop_this_cpu(NULL); } static void native_machine_power_off(void) @@ -523,3 +551,95 @@ void machine_crash_shutdown(struct pt_regs *regs) machine_ops.crash_shutdown(regs); } #endif + + +#if defined(CONFIG_SMP) + +/* This keeps a track of which one is crashing cpu. */ +static int crashing_cpu; +static nmi_shootdown_cb shootdown_callback; + +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) +{ + int cpu; + + if (val != DIE_NMI_IPI) + return NOTIFY_OK; + + cpu = raw_smp_processor_id(); + + /* Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return NOTIFY_STOP; + local_irq_disable(); + + shootdown_callback(cpu, (struct die_args *)data); + + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + halt(); + for (;;) + cpu_relax(); + + return 1; +} + +static void smp_send_nmi_allbutself(void) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(safe_smp_processor_id(), mask); + if (!cpus_empty(mask)) + send_IPI_mask(mask, NMI_VECTOR); +} + +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + +/* Halt all other CPUs, calling the specified function on each of them + * + * This function can be used to halt all other CPUs on crash + * or emergency reboot time. The function passed as parameter + * will be called inside a NMI handler on all CPUs. + */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + unsigned long msecs; + local_irq_disable(); + + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = safe_smp_processor_id(); + + shootdown_callback = callback; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ + /* Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* Leave the nmi callback set */ +} +#else /* !CONFIG_SMP */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + /* No other CPUs to shoot down */ +} +#endif diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 6f50664b2ba..a160f311972 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -10,15 +10,12 @@ #include <asm/page.h> #include <asm/kexec.h> #include <asm/processor-flags.h> -#include <asm/pgtable.h> /* * Must be relocatable PIC code callable as a C function */ #define PTR(x) (x << 2) -#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define PAE_PGD_ATTR (_PAGE_PRESENT) /* control_page + KEXEC_CONTROL_CODE_MAX_SIZE * ~ control_page + PAGE_SIZE are used as data storage and stack for @@ -39,7 +36,6 @@ #define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) .text - .align PAGE_SIZE .globl relocate_kernel relocate_kernel: /* Save the CPU context, used for jumping back */ @@ -60,117 +56,6 @@ relocate_kernel: movl %cr4, %eax movl %eax, CR4(%edi) -#ifdef CONFIG_X86_PAE - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_0)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_1)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#else - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#endif - -relocate_new_kernel: /* read the arguments and say goodbye to the stack */ movl 20+4(%esp), %ebx /* page_list */ movl 20+8(%esp), %ebp /* list of pages */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d5674f7b6c..08e02e8453c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -93,11 +93,13 @@ #include <asm/desc.h> #include <asm/dma.h> #include <asm/iommu.h> +#include <asm/gart.h> #include <asm/mmu_context.h> #include <asm/proto.h> #include <mach_apic.h> #include <asm/paravirt.h> +#include <asm/hypervisor.h> #include <asm/percpu.h> #include <asm/topology.h> @@ -448,6 +450,7 @@ static void __init reserve_early_setup_data(void) * @size: Size of the crashkernel memory to reserve. * Returns the base address on success, and -1ULL on failure. */ +static unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ @@ -583,161 +586,24 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - -/* - * Some BIOSes seem to corrupt the low 64k of memory during events - * like suspend/resume and unplugging an HDMI cable. Reserve all - * remaining free memory in that area and fill it with a distinct - * pattern. - */ -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -#define MAX_SCAN_AREAS 8 - -static int __read_mostly memory_corruption_check = -1; - -static unsigned __read_mostly corruption_check_size = 64*1024; -static unsigned __read_mostly corruption_check_period = 60; /* seconds */ - -static struct e820entry scan_areas[MAX_SCAN_AREAS]; -static int num_scan_areas; - - -static int set_corruption_check(char *arg) +static int __init default_update_genapic(void) { - char *end; - - memory_corruption_check = simple_strtol(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check", set_corruption_check); - -static int set_corruption_check_period(char *arg) -{ - char *end; - - corruption_check_period = simple_strtoul(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_period", set_corruption_check_period); - -static int set_corruption_check_size(char *arg) -{ - char *end; - unsigned size; - - size = memparse(arg, &end); - - if (*end == '\0') - corruption_check_size = size; - - return (size == corruption_check_size) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_size", set_corruption_check_size); - - -static void __init setup_bios_corruption_check(void) -{ - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ - - if (memory_corruption_check == -1) { - memory_corruption_check = -#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK - 1 -#else - 0 +#ifdef CONFIG_X86_SMP +# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) + genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; +# endif #endif - ; - } - - if (corruption_check_size == 0) - memory_corruption_check = 0; - - if (!memory_corruption_check) - return; - - corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - - while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = find_e820_area_size(addr, &size, PAGE_SIZE); - - if (addr == 0) - break; - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - - if (size == 0) - break; - - e820_update_range(addr, size, E820_RAM, E820_RESERVED); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; - - /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); - - addr += size; - } - - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); - update_e820(); -} - -static struct timer_list periodic_check_timer; - -void check_for_bios_corruption(void) -{ - int i; - int corruption = 0; - - if (!memory_corruption_check) - return; - - for(i = 0; i < num_scan_areas; i++) { - unsigned long *addr = __va(scan_areas[i].addr); - unsigned long size = scan_areas[i].size; - - for(; size; addr++, size -= sizeof(unsigned long)) { - if (!*addr) - continue; - printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", - addr, __pa(addr), *addr); - corruption = 1; - *addr = 0; - } - } - - WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); -} - -static void periodic_check_for_corruption(unsigned long data) -{ - check_for_bios_corruption(); - mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); + return 0; } -void start_periodic_check_for_corruption(void) -{ - if (!memory_corruption_check || corruption_check_period == 0) - return; - - printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", - corruption_check_period); +static struct x86_quirks default_x86_quirks __initdata = { + .update_genapic = default_update_genapic, +}; - init_timer(&periodic_check_timer); - periodic_check_timer.function = &periodic_check_for_corruption; - periodic_check_for_corruption(0); -} -#endif +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; +#ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE @@ -749,6 +615,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) return 0; } +#endif /* List of systems that have known low memory corruption BIOS problems */ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { @@ -794,6 +661,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + /* VMI may relocate the fixmap; do this before touching ioremap area */ + vmi_init(); + early_cpu_init(); early_ioremap_init(); @@ -880,13 +750,8 @@ void __init setup_arch(char **cmdline_p) check_efer(); #endif -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be before kernel pagetables are setup - * or fixmap area is touched. - */ - vmi_init(); -#endif + /* Must be before kernel pagetables are setup */ + vmi_activate(); /* after early param, so could get panic from serial */ reserve_early_setup_data(); @@ -909,6 +774,12 @@ void __init setup_arch(char **cmdline_p) dmi_check_system(bad_bios_dmi_table); + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_scan_machine, for the BP. + */ + init_hypervisor(&boot_cpu_data); + #ifdef CONFIG_X86_32 probe_roms(); #endif diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h deleted file mode 100644 index cc673aa55ce..00000000000 --- a/arch/x86/kernel/sigframe.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifdef CONFIG_X86_32 -struct sigframe { - char __user *pretcode; - int sig; - struct sigcontext sc; - /* - * fpstate is unused. fpstate is moved/allocated after - * retcode[] below. This movement allows to have the FP state and the - * future state extensions (xsave) stay together. - * And at the same time retaining the unused fpstate, prevents changing - * the offset of extramask[] in the sigframe and thus prevent any - * legacy application accessing/modifying it. - */ - struct _fpstate fpstate_unused; - unsigned long extramask[_NSIG_WORDS-1]; - char retcode[8]; - /* fp state follows here */ -}; - -struct rt_sigframe { - char __user *pretcode; - int sig; - struct siginfo __user *pinfo; - void __user *puc; - struct siginfo info; - struct ucontext uc; - char retcode[8]; - /* fp state follows here */ -}; -#else -struct rt_sigframe { - char __user *pretcode; - struct ucontext uc; - struct siginfo info; - /* fp state follows here */ -}; - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs *regs); -#endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index dee553c503d..4fa5243c206 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -35,7 +35,7 @@ #include <asm/syscall.h> #include <asm/syscalls.h> -#include "sigframe.h" +#include <asm/sigframe.h> #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) @@ -594,17 +594,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused) return ax; badframe: - if (show_unhandled_signals && printk_ratelimit()) { - printk("%s%s[%d] bad frame in sigreturn frame:" - "%p ip:%lx sp:%lx oeax:%lx", - task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->ip, - regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, current); + signal_fault(regs, frame, "sigreturn"); return 0; } @@ -681,6 +671,11 @@ static int signr_convert(int sig) #define is_ia32 0 #endif /* CONFIG_IA32_EMULATION */ +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs *regs); + #endif /* CONFIG_X86_32 */ static int @@ -906,8 +901,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) struct task_struct *me = current; if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO + printk("%s" "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d18537ce2c7..7e558db362c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -140,19 +140,6 @@ void native_send_call_func_ipi(cpumask_t mask) send_IPI_mask(mask, CALL_FUNCTION_VECTOR); } -static void stop_this_cpu(void *dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_local_APIC(); - if (hlt_works(smp_processor_id())) - for (;;) halt(); - for (;;); -} - /* * this function calls the 'stop' function on all other CPUs in the system. */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7b109339731..f8500c96944 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include <asm/mtrr.h> #include <asm/vmi.h> #include <asm/genapic.h> +#include <asm/setup.h> #include <linux/mc146818rtc.h> #include <mach_apic.h> @@ -287,16 +288,14 @@ static int __cpuinitdata unsafe_smp; /* * Activate a secondary processor. */ -static void __cpuinit start_secondary(void *unused) +notrace static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too * fragile that we want to limit the things done here to the * most necessary things. */ -#ifdef CONFIG_VMI vmi_bringup(); -#endif cpu_init(); preempt_disable(); smp_callin(); @@ -536,7 +535,7 @@ static void impress_friends(void) pr_debug("Before bogocount - setting activated=1.\n"); } -static inline void __inquire_remote_apic(int apicid) +void __inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; @@ -575,14 +574,13 @@ static inline void __inquire_remote_apic(int apicid) } } -#ifdef WAKE_SECONDARY_VIA_NMI /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -static int __devinit -wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt; @@ -599,7 +597,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -614,11 +612,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_NMI */ -#ifdef WAKE_SECONDARY_VIA_INIT -static int __devinit -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt, num_starts, j; @@ -737,7 +733,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_INIT */ struct create_idle { struct work_struct work; @@ -1086,8 +1081,10 @@ static int __init smp_sanity_check(unsigned max_cpus) #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING "weird, boot CPU (#%d) not listed" - "by the BIOS.\n", hard_smp_processor_id()); + printk(KERN_WARNING + "weird, boot CPU (#%d) not listed by the BIOS.\n", + hard_smp_processor_id()); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index a03e7f6d90c..10786af9554 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -6,6 +6,7 @@ #include <linux/sched.h> #include <linux/stacktrace.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/stacktrace.h> static void save_stack_warning(void *data, char *msg) @@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame { + const void __user *next_fp; + unsigned long ret_addr; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static inline void __save_stack_trace_user(struct stack_trace *trace) +{ + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.ret_addr) { + trace->entries[trace->nr_entries++] = + frame.ret_addr; + } + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + } +} + +void save_stack_trace_user(struct stack_trace *trace) +{ + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm) { + __save_stack_trace_user(trace); + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 77b400f06ea..65309e4cb1c 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc); irqreturn_t timer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++; + inc_irq_stat(irq0_irqs); #ifdef CONFIG_X86_IO_APIC if (timer_ack) { diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 418a095c579..891e7a7c433 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -49,9 +49,9 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); -irqreturn_t timer_interrupt(int irq, void *dev_id) +static irqreturn_t timer_interrupt(int irq, void *dev_id) { - add_pda(irq0_irqs, 1); + inc_irq_stat(irq0_irqs); global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index f4049f3513b..8da059f949b 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -34,9 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock); */ void leave_mm(int cpu) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); + cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -104,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) * BUG(); */ - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@ -119,7 +118,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs) smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - __get_cpu_var(irq_stat).irq_tlb_count++; + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@ -238,7 +237,7 @@ static void do_flush_tlb_all(void *info) unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 8f919ca6949..29887d7081a 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) out: ack_APIC_irq(); cpu_clear(cpu, f->flush_cpumask); - add_pda(irq_tlb_count, 1); + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 04431f34fd1..6a00e5faaa7 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -566,14 +566,10 @@ static int __init uv_ptc_init(void) if (!is_uv_system()) return 0; - if (!proc_mkdir("sgi_uv", NULL)) - return -EINVAL; - proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); if (!proc_uv_ptc) { printk(KERN_ERR "unable to create %s proc entry\n", UV_PTC_BASENAME); - remove_proc_entry("sgi_uv", NULL); return -EINVAL; } proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 1106fac6024..808031a5ba1 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,10 +1,26 @@ #include <linux/io.h> #include <asm/trampoline.h> +#include <asm/e820.h> /* ready for x86_64 and x86 */ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +void __init reserve_trampoline_memory(void) +{ +#ifdef CONFIG_X86_32 + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); +#endif + /* Has to be in very low memory so we can execute real-mode AP code. */ + reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, + "TRAMPOLINE"); +} + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller @@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); */ unsigned long setup_trampoline(void) { - memcpy(trampoline_base, trampoline_data, - trampoline_end - trampoline_data); + memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d815293e6d9..141907ab6e2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -660,7 +660,7 @@ void math_error(void __user *ip) { struct task_struct *task; siginfo_t info; - unsigned short cwd, swd; + unsigned short cwd, swd, err; /* * Save the info for the exception handler and clear the error. @@ -671,7 +671,6 @@ void math_error(void __user *ip) task->thread.error_code = 0; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_code = __SI_FAULT; info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@ -685,34 +684,31 @@ void math_error(void __user *ip) */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ + + err = swd & ~cwd & 0x3f; + #ifdef CONFIG_X86_32 + if (!err) return; #endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ + + if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ + } else if (err & 0x004) { /* Divide by Zero */ info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ + } else if (err & 0x008) { /* Overflow */ info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code = FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; - break; + } else { + info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */ } force_sig_info(SIGFPE, &info, task); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 424093b157d..599e5816863 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -15,6 +15,7 @@ #include <asm/vgtod.h> #include <asm/time.h> #include <asm/delay.h> +#include <asm/hypervisor.h> unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -31,6 +32,7 @@ static int tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int tsc_disabled = -1; +static int tsc_clocksource_reliable; /* * Scheduler clock - returns current time in nanosec units. */ @@ -98,6 +100,15 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int __init tsc_setup(char *str) +{ + if (!strcmp(str, "reliable")) + tsc_clocksource_reliable = 1; + return 1; +} + +__setup("tsc=", tsc_setup); + #define MAX_RETRIES 5 #define SMI_TRESHOLD 50000 @@ -352,9 +363,15 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms, fast_calibrate, tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; + tsc_khz = get_hypervisor_tsc_freq(); + if (tsc_khz) { + printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); + return tsc_khz; + } + local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); @@ -731,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { {} }; -/* - * Geode_LX - the OLPC CPU has a possibly a very reliable TSC - */ +static void __init check_system_tsc_reliable(void) +{ #ifdef CONFIG_MGEODE_LX -/* RTSC counts during suspend */ + /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 - -static void __init check_geode_tsc_reliable(void) -{ unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ if (res_low & RTSC_SUSP) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; -} -#else -static inline void check_geode_tsc_reliable(void) { } + tsc_clocksource_reliable = 1; #endif + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + tsc_clocksource_reliable = 1; +} /* * Make an educated guess if the TSC is trustworthy and synchronized @@ -783,6 +797,8 @@ static void __init init_tsc_clocksource(void) { clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, clocksource_tsc.shift); + if (tsc_clocksource_reliable) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ if (check_tsc_unstable()) { clocksource_tsc.rating = 0; @@ -843,7 +859,7 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); - check_geode_tsc_reliable(); + check_system_tsc_reliable(); init_tsc_clocksource(); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 1c0dfbca87c..bf36328f6ef 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -112,6 +112,12 @@ void __cpuinit check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + printk(KERN_INFO + "Skipping synchronization checks as TSC is reliable.\n"); + return; + } + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", smp_processor_id(), cpu); @@ -165,7 +171,7 @@ void __cpuinit check_tsc_sync_target(void) { int cpus = 2; - if (unsynchronized_tsc()) + if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) return; /* diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 8b6c393ab9f..23206ba1687 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -266,109 +266,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_DEBUG_PAGE_TYPE - -#ifdef CONFIG_X86_PAE -#define MAX_BOOT_PTS (2048+4+1) -#else -#define MAX_BOOT_PTS (1024+1) -#endif - -/* - * During boot, mem_map is not yet available in paging_init, so stash - * all the boot page allocations here. - */ -static struct { - u32 pfn; - int type; -} boot_page_allocations[MAX_BOOT_PTS]; -static int num_boot_page_allocations; -static int boot_allocations_applied; - -void vmi_apply_boot_page_allocations(void) -{ - int i; - BUG_ON(!mem_map); - for (i = 0; i < num_boot_page_allocations; i++) { - struct page *page = pfn_to_page(boot_page_allocations[i].pfn); - page->type = boot_page_allocations[i].type; - page->type = boot_page_allocations[i].type & - ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - } - boot_allocations_applied = 1; -} - -static void record_page_type(u32 pfn, int type) -{ - BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS); - boot_page_allocations[num_boot_page_allocations].pfn = pfn; - boot_page_allocations[num_boot_page_allocations].type = type; - num_boot_page_allocations++; -} - -static void check_zeroed_page(u32 pfn, int type, struct page *page) -{ - u32 *ptr; - int i; - int limit = PAGE_SIZE / sizeof(int); - - if (page_address(page)) - ptr = (u32 *)page_address(page); - else - ptr = (u32 *)__va(pfn << PAGE_SHIFT); - /* - * When cloning the root in non-PAE mode, only the userspace - * pdes need to be zeroed. - */ - if (type & VMI_PAGE_CLONE) - limit = KERNEL_PGD_BOUNDARY; - for (i = 0; i < limit; i++) - BUG_ON(ptr[i]); -} - -/* - * We stash the page type into struct page so we can verify the page - * types are used properly. - */ -static void vmi_set_page_type(u32 pfn, int type) -{ - /* PAE can have multiple roots per page - don't track */ - if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) - return; - - if (boot_allocations_applied) { - struct page *page = pfn_to_page(pfn); - if (type != VMI_PAGE_NORMAL) - BUG_ON(page->type); - else - BUG_ON(page->type == VMI_PAGE_NORMAL); - page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - if (type & VMI_PAGE_ZEROED) - check_zeroed_page(pfn, type, page); - } else { - record_page_type(pfn, type); - } -} - -static void vmi_check_page_type(u32 pfn, int type) -{ - /* PAE can have multiple roots per page - skip checks */ - if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) - return; - - type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - if (boot_allocations_applied) { - struct page *page = pfn_to_page(pfn); - BUG_ON((page->type ^ type) & VMI_PAGE_PAE); - BUG_ON(type == VMI_PAGE_NORMAL && page->type); - BUG_ON((type & page->type) == 0); - } -} -#else -#define vmi_set_page_type(p,t) do { } while (0) -#define vmi_check_page_type(p,t) do { } while (0) -#endif - #ifdef CONFIG_HIGHPTE static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { - vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } @@ -406,27 +302,22 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) * It is called only for swapper_pg_dir, which already has * data on it. */ - vmi_set_page_type(pfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) { - vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); - vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } static void vmi_release_pte(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); - vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } static void vmi_release_pmd(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); - vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } /* @@ -450,26 +341,22 @@ static void vmi_release_pmd(unsigned long pfn) static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_set_pte(pte_t *ptep, pte_t pte) { /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD); vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); } static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } @@ -477,10 +364,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) { #ifdef CONFIG_X86_PAE const pte_t pte = { .pte = pmdval.pmd }; - vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); #else const pte_t pte = { pmdval.pud.pgd.pgd }; - vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD); #endif vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); } @@ -502,7 +387,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); } @@ -510,21 +394,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval) { /* Um, eww */ const pte_t pte = { .pte = pudval.pgd.pgd }; - vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); } static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { const pte_t pte = { .pte = 0 }; - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_pmd_clear(pmd_t *pmd) { const pte_t pte = { .pte = 0 }; - vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); } #endif @@ -960,8 +841,6 @@ static inline int __init activate_vmi(void) void __init vmi_init(void) { - unsigned long flags; - if (!vmi_rom) probe_vmi_rom(); else @@ -973,13 +852,21 @@ void __init vmi_init(void) reserve_top_address(-vmi_rom->virtual_top); - local_irq_save(flags); - activate_vmi(); - #ifdef CONFIG_X86_IO_APIC /* This is virtual hardware; timer routing is wired correctly */ no_timer_check = 1; #endif +} + +void vmi_activate(void) +{ + unsigned long flags; + + if (!vmi_rom) + return; + + local_irq_save(flags); + activate_vmi(); local_irq_restore(flags & X86_EFLAGS_IF); } diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a9b8560adbc..82c67559dde 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -44,6 +44,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 46e05447405..1a614c0e6be 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -35,6 +35,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ebf2f12900f..44153afc906 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -17,6 +17,9 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +/* Disable profiling for userspace code: */ +#define DISABLE_BRANCH_PROFILING + #include <linux/time.h> #include <linux/init.h> #include <linux/kernel.h> |