diff options
Diffstat (limited to 'arch/x86')
50 files changed, 690 insertions, 1761 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e330da21b84..97b528d660a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1141,16 +1141,16 @@ config NUMA comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) -config K8_NUMA +config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" depends on X86_64 && NUMA && PCI ---help--- - Enable K8 NUMA node topology detection. You should say Y here if - you have a multi processor AMD K8 system. This uses an old - method to read the NUMA configuration directly from the builtin - Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA - instead, which also takes priority if both are compiled in. + Enable AMD NUMA node topology detection. You should say Y here if + you have a multi processor AMD system. This uses an old method to + read the NUMA configuration directly from the builtin Northbridge + of Opteron. It is recommended to use X86_64_ACPI_NUMA instead, + which also takes priority if both are compiled in. config X86_64_ACPI_NUMA def_bool y diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 76561d20ea2..4a2adaa9aef 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -180,8 +180,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ +struct text_poke_param { + void *addr; + const void *opcode; + size_t len; +}; + extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); +extern void text_poke_smp_batch(struct text_poke_param *params, int n); #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) #define IDEAL_NOP_SIZE_5 5 diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index c8517f81b21..6aee50d655d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -3,36 +3,53 @@ #include <linux/pci.h> -extern struct pci_device_id k8_nb_ids[]; +extern struct pci_device_id amd_nb_misc_ids[]; struct bootnode; -extern int early_is_k8_nb(u32 value); -extern int cache_k8_northbridges(void); -extern void k8_flush_garts(void); -extern int k8_get_nodes(struct bootnode *nodes); -extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); -extern int k8_scan_nodes(void); +extern int early_is_amd_nb(u32 value); +extern int amd_cache_northbridges(void); +extern void amd_flush_garts(void); +extern int amd_get_nodes(struct bootnode *nodes); +extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); +extern int amd_scan_nodes(void); -struct k8_northbridge_info { +struct amd_northbridge { + struct pci_dev *misc; +}; + +struct amd_northbridge_info { u16 num; - u8 gart_supported; - struct pci_dev **nb_misc; + u64 flags; + struct amd_northbridge *nb; }; -extern struct k8_northbridge_info k8_northbridges; +extern struct amd_northbridge_info amd_northbridges; + +#define AMD_NB_GART 0x1 +#define AMD_NB_L3_INDEX_DISABLE 0x2 #ifdef CONFIG_AMD_NB -static inline struct pci_dev *node_to_k8_nb_misc(int node) +static inline int amd_nb_num(void) { - return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; + return amd_northbridges.num; } -#else +static inline int amd_nb_has_feature(int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} -static inline struct pci_dev *node_to_k8_nb_misc(int node) +static inline struct amd_northbridge *node_to_amd_nb(int node) { - return NULL; + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; } + +#else + +#define amd_nb_num(x) 0 +#define amd_nb_has_feature(x) false +#define node_to_amd_nb(x) NULL + #endif diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 13b0ebaa512..ba870bb6dd8 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -15,10 +15,6 @@ static inline int irq_canonicalize(int irq) return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_LOCAL_APIC -# define ARCH_HAS_NMI_WATCHDOG -#endif - #ifdef CONFIG_X86_32 extern void irq_ctx_init(int cpu); #else diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 5bdfca86581..f23eb252846 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -28,7 +28,7 @@ extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_registers(struct pt_regs *regs); extern void show_trace(struct task_struct *t, struct pt_regs *regs, - unsigned long *sp, unsigned long bp); + unsigned long *sp); extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6b89f5e8602..86030f63ba0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -123,6 +123,10 @@ #define MSR_AMD64_IBSCTL 0xc001103a #define MSR_AMD64_IBSBRTARGET 0xc001103b +/* Fam 15h MSRs */ +#define MSR_F15H_PERF_CTL 0xc0010200 +#define MSR_F15H_PERF_CTR 0xc0010201 + /* Fam 10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 #define FAM10H_MMIO_CONF_ENABLE (1<<0) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 932f0f86b4b..c4021b95351 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -5,41 +5,15 @@ #include <asm/irq.h> #include <asm/io.h> -#ifdef ARCH_HAS_NMI_WATCHDOG - -/** - * do_nmi_callback - * - * Check to see if a callback exists and execute it. Return 1 - * if the handler exists and was handled successfully. - */ -int do_nmi_callback(struct pt_regs *regs, int cpu); +#ifdef CONFIG_X86_LOCAL_APIC extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); -extern int check_nmi_watchdog(void); -#if !defined(CONFIG_LOCKUP_DETECTOR) -extern int nmi_watchdog_enabled; -#endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -extern void setup_apic_nmi_watchdog(void *); -extern void stop_apic_nmi_watchdog(void *); -extern void disable_timer_nmi_watchdog(void); -extern void enable_timer_nmi_watchdog(void); -extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason); -extern void cpu_nmi_set_wd_enabled(void); - -extern atomic_t nmi_active; -extern unsigned int nmi_watchdog; -#define NMI_NONE 0 -#define NMI_IO_APIC 1 -#define NMI_LOCAL_APIC 2 -#define NMI_INVALID 3 - struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); @@ -47,33 +21,8 @@ extern int unknown_nmi_panic; void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace - -static inline void localise_nmi_watchdog(void) -{ - if (nmi_watchdog == NMI_IO_APIC) - nmi_watchdog = NMI_LOCAL_APIC; -} - -/* check if nmi_watchdog is active (ie was specified at boot) */ -static inline int nmi_watchdog_active(void) -{ - /* - * actually it should be: - * return (nmi_watchdog == NMI_LOCAL_APIC || - * nmi_watchdog == NMI_IO_APIC) - * but since they are power of two we could use a - * cheaper way --cvg - */ - return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC); -} #endif -void lapic_watchdog_stop(void); -int lapic_watchdog_init(unsigned nmi_hz); -int lapic_wd_event(unsigned nmi_hz); -unsigned lapic_adjust_nmi_hz(unsigned hz); -void disable_lapic_nmi_watchdog(void); -void enable_lapic_nmi_watchdog(void); void stop_nmi(void); void restart_nmi(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 550e26b1dbb..d9d4dae305f 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -125,7 +125,6 @@ union cpuid10_edx { #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #ifdef CONFIG_PERF_EVENTS -extern void init_hw_perf_events(void); extern void perf_events_lapic_init(void); #define PERF_EVENT_INDEX_OFFSET 0 @@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); } #else -static inline void init_hw_perf_events(void) { } static inline void perf_events_lapic_init(void) { } #endif diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index a70cd216be5..295e2ff18a6 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -744,14 +744,6 @@ enum P4_ESCR_EMASKS { }; /* - * P4 PEBS specifics (Replay Event only) - * - * Format (bits): - * 0-6: metric from P4_PEBS_METRIC enum - * 7 : reserved - * 8 : reserved - * 9-11 : reserved - * * Note we have UOP and PEBS bits reserved for now * just in case if we will need them once */ @@ -788,5 +780,60 @@ enum P4_PEBS_METRIC { P4_PEBS_METRIC__max }; +/* + * Notes on internal configuration of ESCR+CCCR tuples + * + * Since P4 has quite the different architecture of + * performance registers in compare with "architectural" + * once and we have on 64 bits to keep configuration + * of performance event, the following trick is used. + * + * 1) Since both ESCR and CCCR registers have only low + * 32 bits valuable, we pack them into a single 64 bit + * configuration. Low 32 bits of such config correspond + * to low 32 bits of CCCR register and high 32 bits + * correspond to low 32 bits of ESCR register. + * + * 2) The meaning of every bit of such config field can + * be found in Intel SDM but it should be noted that + * we "borrow" some reserved bits for own usage and + * clean them or set to a proper value when we do + * a real write to hardware registers. + * + * 3) The format of bits of config is the following + * and should be either 0 or set to some predefined + * values: + * + * Low 32 bits + * ----------- + * 0-6: P4_PEBS_METRIC enum + * 7-11: reserved + * 12: reserved (Enable) + * 13-15: reserved (ESCR select) + * 16-17: Active Thread + * 18: Compare + * 19: Complement + * 20-23: Threshold + * 24: Edge + * 25: reserved (FORCE_OVF) + * 26: reserved (OVF_PMI_T0) + * 27: reserved (OVF_PMI_T1) + * 28-29: reserved + * 30: reserved (Cascade) + * 31: reserved (OVF) + * + * High 32 bits + * ------------ + * 0: reserved (T1_USR) + * 1: reserved (T1_OS) + * 2: reserved (T0_USR) + * 3: reserved (T0_OS) + * 4: Tag Enable + * 5-8: Tag Value + * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper) + * 25-30: enum P4_EVENTS + * 31: reserved (HT thread) + */ + #endif /* PERF_EVENT_P4_H */ diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h index 1def6011490..6c22bf353f2 100644 --- a/arch/x86/include/asm/smpboot_hooks.h +++ b/arch/x86/include/asm/smpboot_hooks.h @@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void) setup_IO_APIC(); else { nr_ioapics = 0; - localise_nmi_watchdog(); } #endif } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 2b16a2ad23d..52b5c7ed360 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -7,6 +7,7 @@ #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> +#include <linux/ptrace.h> extern int kstack_depth_to_print; @@ -46,7 +47,7 @@ struct stacktrace_ops { }; void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, + unsigned long *stack, const struct stacktrace_ops *ops, void *data); #ifdef CONFIG_X86_32 @@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif +#ifdef CONFIG_FRAME_POINTER +static inline unsigned long +stack_frame(struct task_struct *task, struct pt_regs *regs) +{ + unsigned long bp; + + if (regs) + return regs->bp; + + if (task == current) { + /* Grab bp right from our regs */ + get_bp(bp); + return bp; + } + + /* bp is the last reg pushed by switch_to */ + return *(unsigned long *)task->thread.sp; +} +#else +static inline unsigned long +stack_frame(struct task_struct *task, struct pt_regs *regs) +{ + return 0; +} +#endif + extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); + unsigned long *stack, char *log_lvl); extern void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); + unsigned long *sp, char *log_lvl); extern unsigned int code_bytes; diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 5469630b27f..fa7b9176b76 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -10,12 +10,6 @@ unsigned long long native_sched_clock(void); extern int recalibrate_cpu_khz(void); -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -extern int timer_ack; -#else -# define timer_ack (0) -#endif - extern int no_timer_check; /* Accelerators for sched_clock() diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5079f24c955..553d0b0d639 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -591,17 +591,21 @@ static atomic_t stop_machine_first; static int wrote_text; struct text_poke_params { - void *addr; - const void *opcode; - size_t len; + struct text_poke_param *params; + int nparams; }; static int __kprobes stop_machine_text_poke(void *data) { struct text_poke_params *tpp = data; + struct text_poke_param *p; + int i; if (atomic_dec_and_test(&stop_machine_first)) { - text_poke(tpp->addr, tpp->opcode, tpp->len); + for (i = 0; i < tpp->nparams; i++) { + p = &tpp->params[i]; + text_poke(p->addr, p->opcode, p->len); + } smp_wmb(); /* Make sure other cpus see that this has run */ wrote_text = 1; } else { @@ -610,8 +614,12 @@ static int __kprobes stop_machine_text_poke(void *data) smp_mb(); /* Load wrote_text before following execution */ } - flush_icache_range((unsigned long)tpp->addr, - (unsigned long)tpp->addr + tpp->len); + for (i = 0; i < tpp->nparams; i++) { + p = &tpp->params[i]; + flush_icache_range((unsigned long)p->addr, + (unsigned long)p->addr + p->len); + } + return 0; } @@ -631,10 +639,13 @@ static int __kprobes stop_machine_text_poke(void *data) void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) { struct text_poke_params tpp; + struct text_poke_param p; - tpp.addr = addr; - tpp.opcode = opcode; - tpp.len = len; + p.addr = addr; + p.opcode = opcode; + p.len = len; + tpp.params = &p; + tpp.nparams = 1; atomic_set(&stop_machine_first, 1); wrote_text = 0; /* Use __stop_machine() because the caller already got online_cpus. */ @@ -642,6 +653,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) return addr; } +/** + * text_poke_smp_batch - Update instructions on a live kernel on SMP + * @params: an array of text_poke parameters + * @n: the number of elements in params. + * + * Modify multi-byte instruction by using stop_machine() on SMP. Since the + * stop_machine() is heavy task, it is better to aggregate text_poke requests + * and do it once if possible. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) +{ + struct text_poke_params tpp = {.params = params, .nparams = n}; + + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); +} + #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 8f6463d8ed0..affacb5e006 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -12,95 +12,116 @@ static u32 *flush_words; -struct pci_device_id k8_nb_ids[] = { +struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; -EXPORT_SYMBOL(k8_nb_ids); +EXPORT_SYMBOL(amd_nb_misc_ids); -struct k8_northbridge_info k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); +struct amd_northbridge_info amd_northbridges; +EXPORT_SYMBOL(amd_northbridges); -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +static struct pci_dev *next_northbridge(struct pci_dev *dev, + struct pci_device_id *ids) { do { dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); if (!dev) break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); + } while (!pci_match_id(ids, dev)); return dev; } -int cache_k8_northbridges(void) +int amd_cache_northbridges(void) { - int i; - struct pci_dev *dev; + int i = 0; + struct amd_northbridge *nb; + struct pci_dev *misc; - if (k8_northbridges.num) + if (amd_nb_num()) return 0; - dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - k8_northbridges.num++; + misc = NULL; + while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) + i++; - /* some CPU families (e.g. family 0x11) do not support GART */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - boot_cpu_data.x86 == 0x15) - k8_northbridges.gart_supported = 1; + if (i == 0) + return 0; - k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * - sizeof(void *), GFP_KERNEL); - if (!k8_northbridges.nb_misc) + nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) return -ENOMEM; - if (!k8_northbridges.num) { - k8_northbridges.nb_misc[0] = NULL; - return 0; - } + amd_northbridges.nb = nb; + amd_northbridges.num = i; - if (k8_northbridges.gart_supported) { - flush_words = kmalloc(k8_northbridges.num * sizeof(u32), - GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges.nb_misc); - return -ENOMEM; - } - } + misc = NULL; + for (i = 0; i != amd_nb_num(); i++) { + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, amd_nb_misc_ids); + } + + /* some CPU families (e.g. family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_GART; + + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10. + */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_mask >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; - dev = NULL; - i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges.nb_misc[i] = dev; - if (k8_northbridges.gart_supported) - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - k8_northbridges.nb_misc[i] = NULL; return 0; } -EXPORT_SYMBOL_GPL(cache_k8_northbridges); +EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* Ignores subdevice/subvendor but as far as I can figure out they're useless anyways */ -int __init early_is_k8_nb(u32 device) +int __init early_is_amd_nb(u32 device) { struct pci_device_id *id; u32 vendor = device & 0xffff; device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) + for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return 1; return 0; } -void k8_flush_garts(void) +int amd_cache_gart(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + return -ENOMEM; + } + + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, + &flush_words[i]); + + return 0; +} + +void amd_flush_garts(void) { int flushed, i; unsigned long flags; static DEFINE_SPINLOCK(gart_lock); - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; /* Avoid races between AGP and IOMMU. In theory it's not needed @@ -109,16 +130,16 @@ void k8_flush_garts(void) that it doesn't matter to serialize more. -AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < k8_northbridges.num; i++) { - pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, - flush_words[i]|1); + for (i = 0; i < amd_nb_num(); i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); flushed++; } - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges.nb_misc[i], + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &w); if (!(w & 1)) break; @@ -129,19 +150,23 @@ void k8_flush_garts(void) if (!flushed) printk("nothing to flush?\n"); } -EXPORT_SYMBOL_GPL(k8_flush_garts); +EXPORT_SYMBOL_GPL(amd_flush_garts); -static __init int init_k8_nbs(void) +static __init int init_amd_nbs(void) { int err = 0; - err = cache_k8_northbridges(); + err = amd_cache_northbridges(); if (err < 0) - printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + + if (amd_cache_gart() < 0) + printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " + "GART support disabled.\n"); return err; } /* This has to go after the PCI subsystem */ -fs_initcall(init_k8_nbs); +fs_initcall(init_amd_nbs); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b3a16e8f070..dcd7c83e165 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -206,7 +206,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * Do an PCI bus scan by hand because we're running before the PCI * subsystem. * - * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan * generically. It's probably overkill to always scan all slots because * the AGP bridges should be always an own bus on the HT hierarchy, * but do it here for future safety. @@ -303,7 +303,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -358,7 +358,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -400,7 +400,7 @@ int __init gart_iommu_hole_init(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; iommu_detected = 1; @@ -518,7 +518,7 @@ out: dev_base = bus_dev_ranges[i].dev_base; dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 910f20b457c..3966b564ea4 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,7 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) -obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o -endif -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o +obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 30fdce8c5c1..316a3b6b112 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -31,7 +31,6 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/dmi.h> -#include <linux/nmi.h> #include <linux/smp.h> #include <linux/mm.h> @@ -799,11 +798,7 @@ void __init setup_boot_APIC_clock(void) * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - pr_warning("APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; /* Setup the lapic or request the broadcast */ setup_APIC_timer(); @@ -1387,7 +1382,6 @@ void __cpuinit end_local_APIC_setup(void) } #endif - setup_apic_nmi_watchdog(NULL); apic_pm_activate(); /* @@ -1779,17 +1773,10 @@ int __init APIC_init_uniprocessor(void) setup_IO_APIC(); else { nr_ioapics = 0; - localise_nmi_watchdog(); } -#else - localise_nmi_watchdog(); #endif x86_init.timers.setup_percpu_clockev(); -#ifdef CONFIG_X86_64 - check_nmi_watchdog(); -#endif - return 0; } diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 62f6e1e55b9..72ec29e1ae0 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -17,20 +17,31 @@ #include <linux/nmi.h> #include <linux/module.h> +#ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(void) { return (u64)(cpu_khz) * 1000 * 60; } +#endif -#ifdef ARCH_HAS_NMI_WATCHDOG - +#ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; +/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +static unsigned long backtrace_flag; + void arch_trigger_all_cpu_backtrace(void) { int i; + if (test_and_set_bit(0, &backtrace_flag)) + /* + * If there is already a trigger_all_cpu_backtrace() in progress + * (backtrace_flag == 1), don't output double cpu dump infos. + */ + return; + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); printk(KERN_INFO "sending NMI to all CPUs:\n"); @@ -42,6 +53,9 @@ void arch_trigger_all_cpu_backtrace(void) break; mdelay(1); } + + clear_bit(0, &backtrace_flag); + smp_mb__after_clear_bit(); } static int __kprobes @@ -50,7 +64,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; - int cpu = smp_processor_id(); + int cpu; switch (cmd) { case DIE_NMI: @@ -62,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, } regs = args->regs; + cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -91,18 +106,3 @@ static int __init register_trigger_all_cpu_backtrace(void) } early_initcall(register_trigger_all_cpu_backtrace); #endif - -/* STUB calls to mimic old nmi_watchdog behaviour */ -#if defined(CONFIG_X86_LOCAL_APIC) -unsigned int nmi_watchdog = NMI_NONE; -EXPORT_SYMBOL(nmi_watchdog); -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } -#endif -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -EXPORT_SYMBOL(nmi_active); -int unknown_nmi_panic; -void cpu_nmi_set_wd_enabled(void) { return; } -void stop_apic_nmi_watchdog(void *unused) { return; } -void setup_apic_nmi_watchdog(void *unused) { return; } -int __init check_nmi_watchdog(void) { return 0; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2fc696e4d56..f6cd5b41077 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -54,7 +54,6 @@ #include <asm/dma.h> #include <asm/timer.h> #include <asm/i8259.h> -#include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> #include <asm/setup.h> @@ -2646,24 +2645,6 @@ static void lapic_register_intr(int irq) "edge"); } -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - /* * This looks a bit hackish but it's about the only one way of sending * a few INTA cycles to 8259As and any associated glue logic. ICR does @@ -2769,15 +2750,6 @@ static inline void __init check_timer(void) */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); legacy_pic->init(1); -#ifdef CONFIG_X86_32 - { - unsigned int ver; - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - } -#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2825,10 +2797,6 @@ static inline void __init check_timer(void) unmask_ioapic(cfg); } if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - legacy_pic->unmask(0); - } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); goto out; @@ -2854,11 +2822,6 @@ static inline void __init check_timer(void) if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - setup_nmi(); - legacy_pic->unmask(0); - } goto out; } /* @@ -2870,15 +2833,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c deleted file mode 100644 index c90041ccb74..00000000000 --- a/arch/x86/kernel/apic/nmi.c +++ /dev/null @@ -1,567 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar <mingo@redhat.com> - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include <asm/apic.h> - -#include <linux/nmi.h> -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/sysdev.h> -#include <linux/sysctl.h> -#include <linux/percpu.h> -#include <linux/kprobes.h> -#include <linux/cpumask.h> -#include <linux/kernel_stat.h> -#include <linux/kdebug.h> -#include <linux/smp.h> - -#include <asm/i8259.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/timer.h> - -#include <asm/mce.h> - -#include <asm/mach_traps.h> - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -/* For reliability, we're prepared to waste bits here. */ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -EXPORT_SYMBOL(nmi_active); - -unsigned int nmi_watchdog = NMI_NONE; -EXPORT_SYMBOL(nmi_watchdog); - -static int panic_on_timeout; - -static unsigned int nmi_hz = HZ; -static DEFINE_PER_CPU(short, wd_enabled); -static int endflag __initdata; - -static inline unsigned int get_nmi_count(int cpu) -{ - return per_cpu(irq_stat, cpu).__nmi_count; -} - -static inline int mce_in_progress(void) -{ -#if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; -} - -#ifdef CONFIG_SMP -/* - * The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* - * Intentionally don't use cpu_relax here. This is - * to make sure that the performance counter really ticks, - * even if there is a simulator or similar that catches the - * pause instruction. On a real HT machine this is fine because - * all other CPUs are busy with "useless" delay loops and don't - * care if they get somewhat less cycles. - */ - while (endflag == 0) - mb(); -} -#endif - -static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) -{ - printk(KERN_CONT "\n"); - - printk(KERN_WARNING - "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", - cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); - - printk(KERN_WARNING - "Please report this to bugzilla.kernel.org,\n"); - printk(KERN_WARNING - "and attach the output of the 'dmesg' command.\n"); - - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - goto error; - - printk(KERN_INFO "Testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) - report_broken_nmi(cpu, prev_nmi_count); - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - goto error; - } - printk("OK.\n"); - - /* - * now that we know it works we can reduce NMI frequency to - * something more reasonable; makes a difference in some configs - */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -error: - if (nmi_watchdog == NMI_IO_APIC) { - if (!timer_through_8259) - legacy_pic->mask(0); - on_each_cpu(__acpi_nmi_disable, NULL, 1); - } - -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - return -1; -} - -static int __init setup_nmi_watchdog(char *str) -{ - unsigned int nmi; - - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - if (!strncmp(str, "lapic", 5)) - nmi_watchdog = NMI_LOCAL_APIC; - else if (!strncmp(str, "ioapic", 6)) - nmi_watchdog = NMI_IO_APIC; - else { - get_option(&str, &nmi); - if (nmi >= NMI_INVALID) - return 0; - nmi_watchdog = nmi; - } - - return 1; -} -__setup("nmi_watchdog=", setup_nmi_watchdog); - -/* - * Suspend/resume support - */ -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* - * should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} - -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 1); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 1); -} - -/* - * This function is called as soon the LAPIC NMI watchdog driver has everything - * in place and it's ready to check if the NMIs belong to the NMI watchdog - */ -void cpu_nmi_set_wd_enabled(void) -{ - __get_cpu_var(wd_enabled) = 1; -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if (!nmi_watchdog_active()) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - else - __acpi_nmi_disable(NULL); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - * - * since NMIs don't listen to _any_ locks, we have to be extremely - * careful not to rely on unsafe variables. The printk might lock - * up though, so we have to break up any console locks first ... - * [when there will be more tty-related locks, break them up here too!] - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(long, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog_active()) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - /* - * Tickle the softlockup detector too: - */ - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - /* - * Since current_thread_info()-> is always on the stack, and we - * always switch the stack NMI-atomically, it's safe to use - * smp_processor_id(). - */ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ - - raw_spin_lock(&lock); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - dump_stack(); - raw_spin_unlock(&lock); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - - rc = 1; - } - - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - touched = 1; - - /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - __this_cpu_inc(alert_counter); - if (__this_cpu_read(alert_counter) == 5 * nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(alert_counter, 0); - } - - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* - * don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static void enable_ioapic_nmi_watchdog_single(void *unused) -{ - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - __acpi_nmi_enable(NULL); -} - -static void enable_ioapic_nmi_watchdog(void) -{ - on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); - touch_nmi_watchdog(); -} - -static void disable_ioapic_nmi_watchdog(void) -{ - on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); -} - -static int __init setup_unknown_nmi_panic(char *str) -{ - unknown_nmi_panic = 1; - return 1; -} -__setup("unknown_nmi_panic", setup_unknown_nmi_panic); - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { - printk(KERN_WARNING - "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else if (nmi_watchdog == NMI_IO_APIC) { - if (nmi_watchdog_enabled) - enable_ioapic_nmi_watchdog(); - else - disable_ioapic_nmi_watchdog(); - } else { - printk(KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif /* CONFIG_SYSCTL */ - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void arch_trigger_all_cpu_backtrace(void) -{ - int i; - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - } -} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b68bda3093..1d59834396b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -894,7 +894,6 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 17ad0336621..9ecf81f9b90 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -149,8 +149,7 @@ union _cpuid4_leaf_ecx { }; struct amd_l3_cache { - struct pci_dev *dev; - bool can_disable; + struct amd_northbridge *nb; unsigned indices; u8 subcaches[4]; }; @@ -311,14 +310,12 @@ struct _cache_attr { /* * L3 cache descriptors */ -static struct amd_l3_cache **__cpuinitdata l3_caches; - static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) { unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->dev, 0x1C4, &val); + pci_read_config_dword(l3->nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); @@ -330,47 +327,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) -{ - struct amd_l3_cache *l3; - struct pci_dev *dev = node_to_k8_nb_misc(node); - - l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); - if (!l3) { - printk(KERN_WARNING "Error allocating L3 struct\n"); - return NULL; - } - - l3->dev = dev; - - amd_calc_l3_indices(l3); - - return l3; -} - -static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, - int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, + int index) { + static struct amd_l3_cache *__cpuinitdata l3_caches; int node; - if (boot_cpu_data.x86 != 0x10) - return; - - if (index < 3) - return; - - /* see errata #382 and #388 */ - if (boot_cpu_data.x86_model < 0x8) - return; - - if ((boot_cpu_data.x86_model == 0x8 || - boot_cpu_data.x86_model == 0x9) - && - boot_cpu_data.x86_mask < 0x1) - return; - - /* not in virtualized environments */ - if (k8_northbridges.num == 0) + /* only for L3, and not in virtualized environments */ + if (index < 3 || amd_nb_num() == 0) return; /* @@ -378,7 +342,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); + int size = amd_nb_num() * sizeof(struct amd_l3_cache); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) @@ -387,14 +351,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, node = amd_get_nb_id(smp_processor_id()); - if (!l3_caches[node]) { - l3_caches[node] = amd_init_l3_cache(node); - l3_caches[node]->can_disable = true; + if (!l3_caches[node].nb) { + l3_caches[node].nb = node_to_amd_nb(node); + amd_calc_l3_indices(&l3_caches[node]); } - WARN_ON(!l3_caches[node]); - - this_leaf->l3 = l3_caches[node]; + this_leaf->l3 = &l3_caches[node]; } /* @@ -408,7 +370,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -422,7 +384,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; index = amd_get_l3_disable_slot(this_leaf->l3, slot); @@ -457,7 +420,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, if (!l3->subcaches[i]) continue; - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -467,7 +430,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); } } @@ -524,7 +487,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -545,7 +509,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ + const char *buf, size_t count) \ { \ return store_cache_disable(this_leaf, buf, count, slot); \ } @@ -558,10 +522,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); #else /* CONFIG_AMD_NB */ -static void __cpuinit -amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) -{ -}; +#define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB */ static int @@ -575,7 +536,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(this_leaf, index); + amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -983,30 +944,48 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -#define DEFAULT_SYSFS_CACHE_ATTRS \ - &type.attr, \ - &level.attr, \ - &coherency_line_size.attr, \ - &physical_line_partition.attr, \ - &ways_of_associativity.attr, \ - &number_of_sets.attr, \ - &size.attr, \ - &shared_cpu_map.attr, \ - &shared_cpu_list.attr - static struct attribute *default_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, + &type.attr, + &level.attr, + &coherency_line_size.attr, + &physical_line_partition.attr, + &ways_of_associativity.attr, + &number_of_sets.attr, + &size.attr, + &shared_cpu_map.attr, + &shared_cpu_list.attr, NULL }; -static struct attribute *default_l3_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, #ifdef CONFIG_AMD_NB - &cache_disable_0.attr, - &cache_disable_1.attr, +static struct attribute ** __cpuinit amd_l3_attrs(void) +{ + static struct attribute **attrs; + int n; + + if (attrs) + return attrs; + + n = sizeof (default_attrs) / sizeof (struct attribute *); + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); + if (attrs == NULL) + return attrs = default_attrs; + + for (n = 0; default_attrs[n]; n++) + attrs[n] = default_attrs[n]; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + attrs[n++] = &cache_disable_0.attr; + attrs[n++] = &cache_disable_1.attr; + } + + return attrs; +} #endif - NULL -}; static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1117,11 +1096,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_leaf = CPUID4_INFO_IDX(cpu, i); - if (this_leaf->l3 && this_leaf->l3->can_disable) - ktype_cache.default_attrs = default_l3_attrs; - else - ktype_cache.default_attrs = default_attrs; - + ktype_cache.default_attrs = default_attrs; +#ifdef CONFIG_AMD_NB + if (this_leaf->l3) + ktype_cache.default_attrs = amd_l3_attrs(); +#endif retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6d75b9145b1..0a360d14659 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void) { int i; - if (nmi_watchdog == NMI_LOCAL_APIC) - disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; @@ -355,9 +352,6 @@ perfctr_fail: for (i--; i >= 0; i--) release_perfctr_nmi(x86_pmu.perfctr + i); - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); - return false; } @@ -369,9 +363,6 @@ static void release_pmc_hardware(void) release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); } #else @@ -384,15 +375,53 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { u64 val, val_new = 0; - int ret = 0; + int i, reg, ret = 0; + + /* + * Check to see if the BIOS enabled any of the counters, if so + * complain and bail. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu.eventsel + i; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + goto bios_fail; + } + if (x86_pmu.num_counters_fixed) { + reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (val & (0x03 << i*4)) + goto bios_fail; + } + } + + /* + * Now write a value and read it back to see if it matches, + * this is needed to detect certain hardware emulators (qemu/kvm) + * that don't trap on the MSR access and always return 0s. + */ val = 0xabcdUL; - ret |= checking_wrmsrl(x86_pmu.perfctr, val); + ret = checking_wrmsrl(x86_pmu.perfctr, val); ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); if (ret || val != val_new) - return false; + goto msr_fail; return true; + +bios_fail: + printk(KERN_CONT "Broken BIOS detected, using software events only.\n"); + printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); + return false; + +msr_fail: + printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + return false; } static void reserve_ds_buffers(void); @@ -451,7 +480,7 @@ static int x86_setup_perfctr(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; u64 config; - if (!hwc->sample_period) { + if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); @@ -1362,7 +1391,7 @@ static void __init pmu_check_apic(void) pr_info("no hardware sampling interrupt available.\n"); } -void __init init_hw_perf_events(void) +int __init init_hw_perf_events(void) { struct event_constraint *c; int err; @@ -1377,20 +1406,18 @@ void __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return; + return 0; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return; + return 0; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists()) { - pr_cont("Broken PMU hardware detected, software events only.\n"); - return; - } + if (!check_hw_exists()) + return 0; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -1438,9 +1465,12 @@ void __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); - perf_pmu_register(&pmu); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); perf_cpu_notifier(x86_pmu_notifier); + + return 0; } +early_initcall(init_hw_perf_events); static inline void x86_pmu_read(struct perf_event *event) { @@ -1686,7 +1716,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, &backtrace_ops, entry); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index e421b8cd694..67e2202a603 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,7 +1,5 @@ #ifdef CONFIG_CPU_SUP_AMD -static DEFINE_RAW_SPINLOCK(amd_nb_lock); - static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -275,7 +273,7 @@ done: return &emptyconstraint; } -static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +static struct amd_nb *amd_alloc_nb(int cpu) { struct amd_nb *nb; int i; @@ -285,7 +283,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) if (!nb) return NULL; - nb->nb_id = nb_id; + nb->nb_id = -1; /* * initialize all possible NB constraints @@ -306,7 +304,7 @@ static int amd_pmu_cpu_prepare(int cpu) if (boot_cpu_data.x86_max_cores < 2) return NOTIFY_OK; - cpuc->amd_nb = amd_alloc_nb(cpu, -1); + cpuc->amd_nb = amd_alloc_nb(cpu); if (!cpuc->amd_nb) return NOTIFY_BAD; @@ -325,8 +323,6 @@ static void amd_pmu_cpu_starting(int cpu) nb_id = amd_get_nb_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); - raw_spin_lock(&amd_nb_lock); - for_each_online_cpu(i) { nb = per_cpu(cpu_hw_events, i).amd_nb; if (WARN_ON_ONCE(!nb)) @@ -341,8 +337,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - - raw_spin_unlock(&amd_nb_lock); } static void amd_pmu_cpu_dead(int cpu) @@ -354,8 +348,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw = &per_cpu(cpu_hw_events, cpu); - raw_spin_lock(&amd_nb_lock); - if (cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; @@ -364,8 +356,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } - - raw_spin_unlock(&amd_nb_lock); } static __initconst const struct x86_pmu amd_pmu = { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c8f5c088cad..24e390e40f2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -816,6 +816,32 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.precise_ip && + (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.ANY_P + * (0x00c0), which is a PEBS capable event, to get the same + * count. + * + * INST_RETIRED.ANY_P counts the number of cycles that retires + * CNTMASK instructions. By setting CNTMASK to a value (16) + * larger than the maximum number of instructions that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less instructions, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } + if (event->attr.type != PERF_TYPE_RAW) return 0; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d9f4ff8fcd6..d5a23661550 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -16,32 +16,12 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/smp.h> -#include <linux/nmi.h> +#include <asm/nmi.h> #include <linux/kprobes.h> #include <asm/apic.h> #include <asm/perf_event.h> -struct nmi_watchdog_ctlblk { - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; - -/* Interface defining a CPU specific perfctr watchdog */ -struct wd_ops { - int (*reserve)(void); - void (*unreserve)(void); - int (*setup)(unsigned nmi_hz); - void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz); - void (*stop)(void); - unsigned perfctr; - unsigned evntsel; - u64 checkbit; -}; - -static const struct wd_ops *wd_ops; - /* * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. @@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops; static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); - /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) { @@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } EXPORT_SYMBOL(release_evntsel_nmi); - -void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); - - if (wd_ops) - wd_ops->unreserve(); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (!wd_ops) - return; - if (!wd_ops->reserve()) { - printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n"); - return; - } - - on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); - touch_nmi_watchdog(); -} - -/* - * Activate the NMI watchdog via the local APIC. - */ - -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - u64 counter_val; - unsigned int retval = hz; - - /* - * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - counter_val = (u64)cpu_khz * 1000; - do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - retval = count + 1; - } - return retval; -} - -static void write_watchdog_counter(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(perfctr_msr, 0 - count); -} - -static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsr(perfctr_msr, (u32)(-count), 0); -} - -/* - * AMD K7/K8/Family10h/Family11h support. - * AMD keeps this interface nicely stable so there is not much variety - */ -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void single_msr_stop_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int single_msr_reserve(void) -{ - if (!reserve_perfctr_nmi(wd_ops->perfctr)) - return 0; - - if (!reserve_evntsel_nmi(wd_ops->evntsel)) { - release_perfctr_nmi(wd_ops->perfctr); - return 0; - } - return 1; -} - -static void single_msr_unreserve(void) -{ - release_evntsel_nmi(wd_ops->evntsel); - release_perfctr_nmi(wd_ops->perfctr); -} - -static void __kprobes -single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops k7_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_k7_watchdog, - .rearm = single_msr_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_K7_PERFCTR0, - .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL << 47, -}; - -/* - * Intel Model 6 (PPro+,P2,P3,P-M,Core1) - */ -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED - -static int setup_p6_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - /* KVM doesn't implement this MSR */ - if (wrmsr_safe(perfctr_msr, 0, 0) < 0) - return 0; - - evntsel = P6_EVNTSEL_INT - | P6_EVNTSEL_OS - | P6_EVNTSEL_USR - | P6_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* - * P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant. - * ArchPerfom/Core Duo also needs this - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p6_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_p6_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_P6_PERFCTR0, - .evntsel = MSR_P6_EVNTSEL0, - .checkbit = 1ULL << 39, -}; - -/* - * Intel P4 performance counters. - * By far the most complicated of all. - */ -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7) -#define P4_ESCR_EVENT_SELECT(N) ((N) << 25) -#define P4_ESCR_OS (1 << 3) -#define P4_ESCR_USR (1 << 2) -#define P4_CCCR_OVF_PMI0 (1 << 26) -#define P4_CCCR_OVF_PMI1 (1 << 27) -#define P4_CCCR_THRESHOLD(N) ((N) << 20) -#define P4_CCCR_COMPLEMENT (1 << 19) -#define P4_CCCR_COMPARE (1 << 18) -#define P4_CCCR_REQUIRED (3 << 16) -#define P4_CCCR_ESCR_SELECT(N) ((N) << 13) -#define P4_CCCR_ENABLE (1 << 12) -#define P4_CCCR_OVF (1 << 31) - -#define P4_CONTROLS 18 -static unsigned int p4_controls[18] = { - MSR_P4_BPU_CCCR0, - MSR_P4_BPU_CCCR1, - MSR_P4_BPU_CCCR2, - MSR_P4_BPU_CCCR3, - MSR_P4_MS_CCCR0, - MSR_P4_MS_CCCR1, - MSR_P4_MS_CCCR2, - MSR_P4_MS_CCCR3, - MSR_P4_FLAME_CCCR0, - MSR_P4_FLAME_CCCR1, - MSR_P4_FLAME_CCCR2, - MSR_P4_FLAME_CCCR3, - MSR_P4_IQ_CCCR0, - MSR_P4_IQ_CCCR1, - MSR_P4_IQ_CCCR2, - MSR_P4_IQ_CCCR3, - MSR_P4_IQ_CCCR4, - MSR_P4_IQ_CCCR5, -}; -/* - * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - * CRU_ESCR0 (with any non-null event selector) through a complemented - * max threshold. [IA32-Vol3, Section 14.9.9] - */ -static int setup_p4_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* - * performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - - /* - * If we're on the kdump kernel or other situation, we may - * still have other performance counter registers set to - * interrupt and they'll keep interrupting forever because - * of the P4_CCCR_OVF quirk. So we need to ACK all the - * pending interrupts and disable all the registers here, - * before reenabling the NMI delivery. Refer to p4_rearm() - * about the P4_CCCR_OVF quirk. - */ - if (reset_devices) { - unsigned int low, high; - int i; - - for (i = 0; i < P4_CONTROLS; i++) { - rdmsr(p4_controls[i], low, high); - low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); - wrmsr(p4_controls[i], low, high); - } - } - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - - /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ - if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) - cccr_val = P4_CCCR_OVF_PMI0; - else - cccr_val = P4_CCCR_OVF_PMI1; - cccr_val |= P4_CCCR_ESCR_SELECT(4); - } - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - return 1; -} - -static void stop_p4_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int p4_reserve(void) -{ - if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0)) - return 0; -#ifdef CONFIG_SMP - if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1)) - goto fail1; -#endif - if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) - goto fail2; - /* RED-PEN why is ESCR1 not reserved here? */ - return 1; - fail2: -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); - fail1: -#endif - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); - return 0; -} - -static void p4_unreserve(void) -{ -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); -#endif - release_evntsel_nmi(MSR_P4_CRU_ESCR0); - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); -} - -static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - unsigned dummy; - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p4_wd_ops = { - .reserve = p4_reserve, - .unreserve = p4_unreserve, - .setup = setup_p4_watchdog, - .rearm = p4_rearm, - .stop = stop_p4_watchdog, - /* RED-PEN this is wrong for the other sibling */ - .perfctr = MSR_P4_BPU_PERFCTR0, - .evntsel = MSR_P4_BSU_ESCR0, - .checkbit = 1ULL << 39, -}; - -/* - * Watchdog using the Intel architected PerfMon. - * Used for Core2 and hopefully all future Intel CPUs. - */ -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static struct wd_ops intel_arch_wd_ops; - -static int setup_intel_arch_watchdog(unsigned nmi_hz) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < - (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); - return 1; -} - -static struct wd_ops intel_arch_wd_ops __read_mostly = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_intel_arch_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR1, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, -}; - -static void probe_nmi_watchdog(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 == 6 || - (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) - wd_ops = &k7_wd_ops; - return; - case X86_VENDOR_INTEL: - /* Work around where perfctr1 doesn't have a working enable - * bit as described in the following errata: - * AE49 Core Duo and Intel Core Solo 65 nm - * AN49 Intel Pentium Dual-Core - * AF49 Dual-Core Intel Xeon Processor LV - */ - if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || - ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && - boot_cpu_data.x86_mask == 4))) { - intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; - intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; - } - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - wd_ops = &intel_arch_wd_ops; - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 13) - return; - - wd_ops = &p6_wd_ops; - break; - case 15: - wd_ops = &p4_wd_ops; - break; - default: - return; - } - break; - } -} - -/* Interface to nmi.c */ - -int lapic_watchdog_init(unsigned nmi_hz) -{ - if (!wd_ops) { - probe_nmi_watchdog(); - if (!wd_ops) { - printk(KERN_INFO "NMI watchdog: CPU not supported\n"); - return -1; - } - - if (!wd_ops->reserve()) { - printk(KERN_ERR - "NMI watchdog: cannot reserve perfctrs\n"); - return -1; - } - } - - if (!(wd_ops->setup(nmi_hz))) { - printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n", - raw_smp_processor_id()); - return -1; - } - - return 0; -} - -void lapic_watchdog_stop(void) -{ - if (wd_ops) - wd_ops->stop(); -} - -unsigned lapic_adjust_nmi_hz(unsigned hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) - hz = adjust_for_32bit_ctr(hz); - return hz; -} - -int __kprobes lapic_wd_event(unsigned nmi_hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 ctr; - - rdmsrl(wd->perfctr_msr, ctr); - if (ctr & wd_ops->checkbit) /* perfctr still running? */ - return 0; - - wd_ops->rearm(wd, nmi_hz); - return 1; -} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6e8752c1bd5..8474c998cbd 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = { void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) + unsigned long *stack, char *log_lvl) { printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); + dump_trace(task, regs, stack, &print_trace_ops, log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) + unsigned long *stack) { - show_trace_log_lvl(task, regs, stack, bp, ""); + show_trace_log_lvl(task, regs, stack, ""); } void show_stack(struct task_struct *task, unsigned long *sp) { - show_stack_log_lvl(task, NULL, sp, 0, ""); + show_stack_log_lvl(task, NULL, sp, ""); } /* @@ -210,7 +210,7 @@ void dump_stack(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); + show_trace(NULL, NULL, &stack); } EXPORT_SYMBOL(dump_stack); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 1bc7f75a5bd..74cc1eda384 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,11 +17,12 @@ #include <asm/stacktrace.h> -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, +void dump_trace(struct task_struct *task, + struct pt_regs *regs, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { int graph = 0; + unsigned long bp; if (!task) task = current; @@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - + bp = stack_frame(task, regs); for (;;) { struct thread_info *context; @@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; @@ -87,7 +77,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, touch_nmi_watchdog(); } printk(KERN_CONT "\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); } @@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs) u8 *ip; printk(KERN_EMERG "Stack:\n"); - show_stack_log_lvl(NULL, regs, ®s->sp, - 0, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->sp, KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6a340485249..64101335de1 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, +void dump_trace(struct task_struct *task, + struct pt_regs *regs, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); @@ -149,6 +149,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned used = 0; struct thread_info *tinfo; int graph = 0; + unsigned long bp; if (!task) task = current; @@ -160,18 +161,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - + bp = stack_frame(task, regs); /* * Print function call entries in all stacks, starting at the * current stack address. If the stacks consist of nested @@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, char *log_lvl) { unsigned long *irq_stack_end; unsigned long *irq_stack; @@ -279,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, preempt_enable(); printk(KERN_CONT "\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); } void show_registers(struct pt_regs *regs) @@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, KERN_EMERG); + KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1cbd54c0df9..5940282bd2f 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); @@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) return 0; } -/* Replace a breakpoint (int3) with a relative jump. */ -int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +#define MAX_OPTIMIZE_PROBES 256 +static struct text_poke_param *jump_poke_params; +static struct jump_poke_buffer { + u8 buf[RELATIVEJUMP_SIZE]; +} *jump_poke_bufs; + +static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) { - unsigned char jmp_code[RELATIVEJUMP_SIZE]; s32 rel = (s32)((long)op->optinsn.insn - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); @@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, RELATIVE_ADDR_SIZE); - jmp_code[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&jmp_code[1]) = rel; + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Replace breakpoints (int3) with relative jumps. + * Caller must call with locking kprobe_mutex and text_mutex. + */ +void __kprobes arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + WARN_ON(kprobe_disabled(&op->kp)); + /* Setup param */ + setup_optimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_del_init(&op->list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } /* * text_poke_smp doesn't support NMI/MCE code modifying. * However, since kprobes itself also doesn't support NMI/MCE * code probing, it's not a problem. */ - text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); - return 0; + text_poke_smp_batch(jump_poke_params, c); +} + +static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) +{ + /* Set int3 to first byte for kprobes */ + insn_buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + /* Setup param */ + setup_unoptimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_move(&op->list, done_list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp_batch(jump_poke_params, c); } /* Replace a relative jump with a breakpoint (int3). */ @@ -1453,11 +1526,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p, } return 0; } + +static int __kprobes init_poke_params(void) +{ + /* Allocate code buffer and parameter array */ + jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_bufs) + return -ENOMEM; + + jump_poke_params = kmalloc(sizeof(struct text_poke_param) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_params) { + kfree(jump_poke_bufs); + jump_poke_bufs = NULL; + return -ENOMEM; + } + + return 0; +} +#else /* !CONFIG_OPTPROBES */ +static int __kprobes init_poke_params(void) +{ + return 0; +} #endif int __init arch_init_kprobes(void) { - return 0; + return init_poke_params(); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ba0f0ca9f28..c01ffa5b9b8 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -143,7 +143,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { - k8_flush_garts(); + amd_flush_garts(); need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -561,17 +561,17 @@ static void enable_gart_translations(void) { int i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; enable_gart_translation(dev, __pa(agp_gatt_table)); } /* Flush the GART-TLB to remove stale entries */ - k8_flush_garts(); + amd_flush_garts(); } /* @@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* * Don't enable translations just yet. That is the next @@ -644,7 +644,7 @@ static struct sys_device device_gart = { * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. */ -static __init int init_k8_gatt(struct agp_kern_info *info) +static __init int init_amd_gatt(struct agp_kern_info *info) { unsigned aper_size, gatt_size, new_aper_size; unsigned aper_base, new_aper_base; @@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < k8_northbridges.num; i++) { - dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 ctl; - dev = k8_northbridges.nb_misc[i]; + dev = node_to_amd_nb(i)->misc; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -749,14 +749,14 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ - /* Add other K8 AGP bridge drivers here */ + /* Add other AMD AGP bridge drivers here */ no_agp = no_agp || (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); @@ -765,7 +765,7 @@ int __init gart_iommu_init(void) if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || - (no_agp && init_k8_gatt(&info) < 0)) { + (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); pr_warning("falling back to iommu=soft.\n"); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 57d1868a86a..c852041bfc3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -91,8 +91,7 @@ void exit_thread(void) void show_regs(struct pt_regs *regs) { show_registers(regs); - show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), - regs->bp); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs)); } void show_regs_common(void) @@ -374,6 +373,7 @@ void default_idle(void) { if (hlt_use_halt()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -444,6 +444,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); + trace_cpu_idle((ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -460,6 +461,7 @@ static void mwait_idle(void) { if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -481,10 +483,12 @@ static void mwait_idle(void) static void poll_idle(void) { trace_power_start(POWER_CSTATE, 0, smp_processor_id()); + trace_cpu_idle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(0); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 96586c3cbbb..4b9befa0e34 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -113,8 +113,8 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); - trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3d7a3a04f3..4c818a73839 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -142,6 +142,8 @@ void cpu_idle(void) start_critical_timings(); trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, + smp_processor_id()); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a25ce8864f1..d3cfe26c025 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -705,7 +705,7 @@ static u64 __init get_max_mapped(void) void __init setup_arch(char **cmdline_p) { int acpi = 0; - int k8 = 0; + int amd = 0; unsigned long flags; #ifdef CONFIG_X86_32 @@ -991,12 +991,12 @@ void __init setup_arch(char **cmdline_p) acpi = acpi_numa_init(); #endif -#ifdef CONFIG_K8_NUMA +#ifdef CONFIG_AMD_NUMA if (!acpi) - k8 = !k8_numa_init(0, max_pfn); + amd = !amd_numa_init(0, max_pfn); #endif - initmem_init(0, max_pfn, acpi, k8); + initmem_init(0, max_pfn, acpi, amd); memblock_find_dma_reserve(); dma32_reserve_bootmem(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 083e99d1b7d..68f61ac632e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void) */ smp_store_cpu_info(cpuid); + /* + * This must be done before setting cpu_online_mask + * or calling notify_cpu_starting. + */ + set_cpu_sibling_map(raw_smp_processor_id()); + wmb(); + notify_cpu_starting(cpuid); /* @@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused) */ check_tsc_sync_target(); - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - enable_NMI_through_LVT0(); - legacy_pic->unmask(0); - } - - /* This must be done before setting cpu_online_mask */ - set_cpu_sibling_map(raw_smp_processor_id()); - wmb(); - /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of @@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus) printk(KERN_INFO "SMP mode deactivated.\n"); smpboot_clear_io_apic(); - localise_nmi_watchdog(); - connect_bsp_APIC(); setup_local_APIC(); end_local_APIC_setup(); @@ -1196,7 +1191,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif - check_nmi_watchdog(); mtrr_aps_init(); } @@ -1341,8 +1335,6 @@ int native_cpu_disable(void) if (cpu == 0) return -EBUSY; - if (nmi_watchdog == NMI_LOCAL_APIC) - stop_apic_nmi_watchdog(NULL); clear_local_APIC(); cpu_disable_common(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index b53c525368a..938c8e10a19 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = { */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); + dump_trace(current, NULL, NULL, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); -void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) { - dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + dump_trace(current, regs, NULL, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); + dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index fb5cc5e14cf..25a28a24593 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -22,10 +22,6 @@ #include <asm/hpet.h> #include <asm/time.h> -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - #ifdef CONFIG_X86_64 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; #endif @@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - raw_spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - raw_spin_unlock(&i8259A_lock); - } - global_clock_event->event_handler(global_clock_event); /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index cb838ca42c9..c76aaca5694 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,6 +83,8 @@ EXPORT_SYMBOL_GPL(used_vectors); static int ignore_nmis; +int unknown_nmi_panic; + static inline void conditional_sti(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -300,6 +302,13 @@ gp_in_kernel: die("general protection fault", regs, error_code); } +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + static notrace __kprobes void mem_parity_error(unsigned char reason, struct pt_regs *regs) { @@ -342,9 +351,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs) reason = (reason & 0xf) | 8; outb(reason, 0x61); - i = 2000; - while (--i) - udelay(1000); + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } reason &= ~8; outb(reason, 0x61); @@ -371,7 +382,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) reason, smp_processor_id()); printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) + if (unknown_nmi_panic || panic_on_unrecovered_nmi) panic("NMI: Not continuing"); printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); @@ -397,20 +408,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; - -#ifndef CONFIG_LOCKUP_DETECTOR - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_LOCKUP_DETECTOR */ - unknown_nmi_error(reason, regs); -#else - unknown_nmi_error(reason, regs); #endif + unknown_nmi_error(reason, regs); return; } @@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code) void stop_nmi(void) { - acpi_nmi_disable(); ignore_nmis++; } void restart_nmi(void) { ignore_nmis--; - acpi_nmi_enable(); } /* May run on IST stack. */ diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index f628234fbec..3cece05e4ac 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -575,6 +575,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s->pics[1].elcr_mask = 0xde; s->pics[0].pics_state = s; s->pics[1].pics_state = s; + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; /* * Initialize PIO device diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fb8b376bf28..fbb04aee830 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2394,7 +2394,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, + sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), + i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL, NULL); root = __pa(sp->spt); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 55543397a8a..09df2f9a3d6 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c index 804a3b6c6e1..51fae9cfdec 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -1,8 +1,8 @@ /* - * AMD K8 NUMA support. + * AMD NUMA support. * Discover the memory map and associated nodes. * - * This version reads it directly from the K8 northbridge. + * This version reads it directly from the AMD northbridge. * * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ @@ -57,7 +57,7 @@ static __init void early_get_boot_cpu_id(void) { /* * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in k8_scan_nodes() + * create apicid_to_node in amd_scan_nodes() */ #ifdef CONFIG_X86_MPPARSE /* @@ -69,7 +69,7 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } -int __init k8_get_nodes(struct bootnode *physnodes) +int __init amd_get_nodes(struct bootnode *physnodes) { int i; int ret = 0; @@ -82,7 +82,7 @@ int __init k8_get_nodes(struct bootnode *physnodes) return ret; } -int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) +int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) { unsigned long start = PFN_PHYS(start_pfn); unsigned long end = PFN_PHYS(end_pfn); @@ -194,7 +194,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) return 0; } -int __init k8_scan_nodes(void) +int __init amd_scan_nodes(void) { unsigned int bits; unsigned int cores; diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index af3b6c8a436..704a37ceddd 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state, e->trace.entries = e->trace_entries; e->trace.max_entries = ARRAY_SIZE(e->trace_entries); e->trace.skip = 0; - save_stack_trace_bp(&e->trace, regs->bp); + save_stack_trace_regs(&e->trace, regs); /* Round address down to nearest 16 bytes */ shadow_copy = kmemcheck_shadow_lookup(address diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7ffc9b727ef..7762a517d69 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -264,7 +264,7 @@ static struct bootnode physnodes[MAX_NUMNODES] __initdata; static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, - int acpi, int k8) + int acpi, int amd) { int nr_nodes = 0; int ret = 0; @@ -274,13 +274,13 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, if (acpi) nr_nodes = acpi_get_nodes(physnodes); #endif -#ifdef CONFIG_K8_NUMA - if (k8) - nr_nodes = k8_get_nodes(physnodes); +#ifdef CONFIG_AMD_NUMA + if (amd) + nr_nodes = amd_get_nodes(physnodes); #endif /* * Basic sanity checking on the physical node map: there may be errors - * if the SRAT or K8 incorrectly reported the topology or the mem= + * if the SRAT or AMD code incorrectly reported the topology or the mem= * kernel parameter is used. */ for (i = 0; i < nr_nodes; i++) { @@ -549,7 +549,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * numa=fake command-line option. */ static int __init numa_emulation(unsigned long start_pfn, - unsigned long last_pfn, int acpi, int k8) + unsigned long last_pfn, int acpi, int amd) { u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; @@ -557,7 +557,7 @@ static int __init numa_emulation(unsigned long start_pfn, int num_nodes; int i; - num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); + num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); /* * If the numa=fake command-line contains a 'M' or 'G', it represents * the fixed node size. Otherwise, if it is just a single number N, @@ -602,7 +602,7 @@ static int __init numa_emulation(unsigned long start_pfn, #endif /* CONFIG_NUMA_EMU */ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, - int acpi, int k8) + int acpi, int amd) { int i; @@ -610,7 +610,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) + if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -624,8 +624,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #endif -#ifdef CONFIG_K8_NUMA - if (!numa_off && k8 && !k8_scan_nodes()) +#ifdef CONFIG_AMD_NUMA + if (!numa_off && amd && !amd_scan_nodes()) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 2d49d4e19a3..72cbec14d78 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) if (!user_mode_vm(regs)) { unsigned long stack = kernel_stack_pointer(regs); if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, 0, + dump_trace(NULL, regs, (unsigned long *)stack, &backtrace_ops, &depth); return; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 4e8baad36d3..358c8b9c96a 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -732,6 +732,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) case 0x14: cpu_type = "x86-64/family14h"; break; + case 0x15: + cpu_type = "x86-64/family15h"; + break; default: return -ENODEV; } diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c index e3ecb71b579..0636dd93cef 100644 --- a/arch/x86/oprofile/nmi_timer_int.c +++ b/arch/x86/oprofile/nmi_timer_int.c @@ -58,9 +58,6 @@ static void timer_stop(void) int __init op_nmi_timer_init(struct oprofile_operations *ops) { - if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0)) - return -ENODEV; - ops->start = timer_start; ops->stop = timer_stop; ops->cpu_type = "timer"; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index a011bcc0f94..51104b33fd5 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -29,11 +29,12 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 4 +#define NUM_COUNTERS 4 +#define NUM_COUNTERS_F15H 6 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -#define NUM_VIRT_COUNTERS 32 +#define NUM_VIRT_COUNTERS 32 #else -#define NUM_VIRT_COUNTERS NUM_COUNTERS +#define NUM_VIRT_COUNTERS 0 #endif #define OP_EVENT_MASK 0x0FFF @@ -41,7 +42,8 @@ #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) -static unsigned long reset_value[NUM_VIRT_COUNTERS]; +static int num_counters; +static unsigned long reset_value[OP_MAX_COUNTER]; #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 @@ -387,7 +389,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, int i; /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -406,7 +408,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!msrs->counters[i].addr) continue; release_perfctr_nmi(MSR_K7_PERFCTR0 + i); @@ -418,7 +420,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) { int i; - for (i = 0; i < NUM_COUNTERS; i++) { + for (i = 0; i < num_counters; i++) { if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) goto fail; if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { @@ -426,8 +428,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) goto fail; } /* both registers must be reserved */ - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + if (num_counters == NUM_COUNTERS_F15H) { + msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); + msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); + } else { + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + } continue; fail: if (!counter_config[i].enabled) @@ -447,7 +454,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, int i; /* setup reset_value */ - for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { + for (i = 0; i < OP_MAX_COUNTER; ++i) { if (counter_config[i].enabled && msrs->counters[op_x86_virt_to_phys(i)].addr) reset_value[i] = counter_config[i].count; @@ -456,7 +463,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } /* clear all counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!msrs->controls[i].addr) continue; rdmsrl(msrs->controls[i].addr, val); @@ -472,7 +479,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -503,7 +510,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, u64 val; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -526,7 +533,7 @@ static void op_amd_start(struct op_msrs const * const msrs) u64 val; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); @@ -546,7 +553,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) * Subtle: stop on all counters to avoid race with setting our * pm callback */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); @@ -630,21 +637,29 @@ static int __init_ibs_nmi(void) return 0; } -/* initialize the APIC for the IBS interrupts if available */ +/* + * check and reserve APIC extended interrupt LVT offset for IBS if + * available + * + * init_ibs() preforms implicitly cpu-local operations, so pin this + * thread to its current CPU + */ + static void init_ibs(void) { - ibs_caps = get_ibs_caps(); + preempt_disable(); + ibs_caps = get_ibs_caps(); if (!ibs_caps) - return; + goto out; - if (__init_ibs_nmi()) { + if (__init_ibs_nmi() < 0) ibs_caps = 0; - return; - } + else + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); - printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", - (unsigned)ibs_caps); +out: + preempt_enable(); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); @@ -698,18 +713,29 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) return 0; } +struct op_x86_model_spec op_amd_spec; + static int op_amd_init(struct oprofile_operations *ops) { init_ibs(); create_arch_files = ops->create_files; ops->create_files = setup_ibs_files; + + if (boot_cpu_data.x86 == 0x15) { + num_counters = NUM_COUNTERS_F15H; + } else { + num_counters = NUM_COUNTERS; + } + + op_amd_spec.num_counters = num_counters; + op_amd_spec.num_controls = num_counters; + op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS); + return 0; } struct op_x86_model_spec op_amd_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_COUNTERS, - .num_virt_counters = NUM_VIRT_COUNTERS, + /* num_counters/num_controls filled in at runtime */ .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, .init = op_amd_init, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 182558dd551..9fadec07414 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -11,7 +11,7 @@ #include <linux/oprofile.h> #include <linux/smp.h> #include <linux/ptrace.h> -#include <linux/nmi.h> +#include <asm/nmi.h> #include <asm/msr.h> #include <asm/fixmap.h> #include <asm/apic.h> |