From a9322f6488b432ddc1e89be88242c827c633fb63 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Wed, 11 Jun 2008 16:35:14 +0200 Subject: x86, pci: introduce pci=noioapicquirk kernel cmdline option Introduce pci=noioapicquirk kernel cmdline option to disable all boot interrupt quirks Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- arch/x86/pci/common.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 940185ecaed..bc6a101ed7e 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | static int pci_bf_sort; int pci_routeirq; +int noioapicquirk; int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -495,6 +496,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "skip_isa_align")) { pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; return NULL; + } else if (!strcmp(str, "noioapicquirk")) { + noioapicquirk = 1; + return NULL; } return str; } -- cgit v1.2.3-70-g09d2 From 9197979b518573999d52d9e85bce1680682ed85c Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Wed, 11 Jun 2008 16:35:15 +0200 Subject: x86, pci: introduce pci=ioapicreroute kernel cmdline option Introduce pci=ioapicreroute kernel cmdline option to enable rerouting of boot interrupts to the primary io-apic. Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 ++++ arch/x86/pci/common.c | 5 +++++ include/asm-x86/io_apic.h | 4 ++++ include/asm-x86/pci.h | 1 + 4 files changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1aebe9dffba..df262b3c3d6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1521,6 +1521,10 @@ and is between 256 and 4096 characters. It is defined in the file noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. + ioapicreroute [APIC] Enable rerouting of boot IRQs to the + primary IO-APIC for bridges that cannot disable + boot IRQs. This fixes a source of spurious IRQs + when the system masks IRQs. biosirq [X86-32] Use PCI BIOS calls to get the interrupt routing table. These calls are known to be buggy on several machines and they hang the machine diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index bc6a101ed7e..0a9eaa736d9 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -23,6 +23,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | static int pci_bf_sort; int pci_routeirq; int noioapicquirk; +int noioapicreroute = 1; int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -499,6 +500,10 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "noioapicquirk")) { noioapicquirk = 1; return NULL; + } else if (!strcmp(str, "ioapicreroute")) { + if (noioapicreroute != -1) + noioapicreroute = 0; + return NULL; } return str; } diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index 8ca0110819f..a39670ae17d 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -160,12 +160,16 @@ extern int skip_ioapic_setup; /* 1 if "noapic" boot option passed */ extern int noioapicquirk; +/* -1 if "noapic" boot option passed */ +extern int noioapicreroute; + /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; static inline void disable_ioapic_setup(void) { noioapicquirk = 1; + noioapicreroute = -1; skip_ioapic_setup = 1; } diff --git a/include/asm-x86/pci.h b/include/asm-x86/pci.h index 30eec93a845..52a29f7668e 100644 --- a/include/asm-x86/pci.h +++ b/include/asm-x86/pci.h @@ -20,6 +20,7 @@ struct pci_sysdata { extern int pci_routeirq; extern int noioapicquirk; +extern int ioapicreroute; /* scan a bus after allocating a pci_sysdata for it */ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, -- cgit v1.2.3-70-g09d2 From 41b9eb264c8407655db57b60b4457fe1b2ec9977 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Tue, 15 Jul 2008 13:48:55 +0200 Subject: x86, pci: introduce config option for pci reroute quirks (was: [PATCH 0/3] Boot IRQ quirks for Broadcom and AMD/ATI) This is against linux-2.6-tip, branch pci-ioapic-boot-irq-quirks. From: Stefan Assmann Subject: Introduce config option for pci reroute quirks The config option X86_REROUTE_FOR_BROKEN_BOOT_IRQS is introduced to enable (or disable) the redirection of the interrupt handler to the boot interrupt line by default. Depending on the existence of interrupt masking / threaded interrupt handling in the kernel (vanilla, rt, ...) and the maturity of the rerouting patch, users can enable or disable the redirection by default. This means that the reroute quirk can be applied to any kernel without changing it. Interrupt sharing could be increased if this option is enabled. However this option is vital for threaded interrupt handling, as done by the RT kernel. It should simplify the consolidation with the RT kernel. The option can be overridden by either pci=ioapicreroute or pci=noioapicreroute. Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Cc: Jesse Barnes Cc: Jon Masters Cc: Ihno Krumreich Cc: Sven Dietrich Cc: Daniel Gollub Cc: Felix Foerster Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 ++++ arch/x86/Kconfig | 24 ++++++++++++++++++++++++ arch/x86/pci/common.c | 8 ++++++++ drivers/pci/quirks.c | 2 +- include/asm-x86/pci.h | 2 +- 5 files changed, 38 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f5662b7a34d..62b6e8067a5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1536,6 +1536,10 @@ and is between 256 and 4096 characters. It is defined in the file primary IO-APIC for bridges that cannot disable boot IRQs. This fixes a source of spurious IRQs when the system masks IRQs. + noioapicreroute [APIC] Disable workaround that uses the + boot IRQ equivalent of an IRQ that connects to + a chipset where boot IRQs cannot be disabled. + The opposite of ioapicreroute. biosirq [X86-32] Use PCI BIOS calls to get the interrupt routing table. These calls are known to be buggy on several machines and they hang the machine diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 96e0c2ebc38..09521332636 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -665,6 +665,30 @@ config X86_VISWS_APIC def_bool y depends on X86_32 && X86_VISWS +config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + bool "Reroute for broken boot IRQs" + default n + depends on X86_IO_APIC + help + This option enables a workaround that fixes a source of + spurious interrupts. This is recommended when threaded + interrupt handling is used on systems where the generation of + superfluous "boot interrupts" cannot be disabled. + + Some chipsets generate a legacy INTx "boot IRQ" when the IRQ + entry in the chipset's IO-APIC is masked (as, e.g. the RT + kernel does during interrupt handling). On chipsets where this + boot IRQ generation cannot be disabled, this workaround keeps + the original IRQ line masked so that only the equivalent "boot + IRQ" is delivered to the CPUs. The workaround also tells the + kernel to set up the IRQ handler on the boot IRQ line. In this + way only one interrupt is delivered to the kernel. Otherwise + the spurious second interrupt may cause the kernel to bring + down (vital) interrupt lines. + + Only affects "broken" chipsets. Interrupt sharing may be + increased on these systems. + config X86_MCE bool "Machine Check Exception" depends on !X86_VOYAGER diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 1485a26ddce..bb1a01f089e 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -24,7 +24,11 @@ unsigned int pci_early_dump_regs; static int pci_bf_sort; int pci_routeirq; int noioapicquirk; +#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS +int noioapicreroute = 0; +#else int noioapicreroute = 1; +#endif int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -528,6 +532,10 @@ char * __devinit pcibios_setup(char *str) if (noioapicreroute != -1) noioapicreroute = 0; return NULL; + } else if (!strcmp(str, "noioapicreroute")) { + if (noioapicreroute != -1) + noioapicreroute = 1; + return NULL; } return str; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 0911b0c60b6..c880dd0bbfb 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1397,7 +1397,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x260b, quirk_intel_pcie_pm); */ static void quirk_reroute_to_boot_interrupts_intel(struct pci_dev *dev) { - if (noioapicquirk) + if (noioapicquirk || noioapicreroute) return; dev->irq_reroute_variant = INTEL_IRQ_REROUTE_VARIANT; diff --git a/include/asm-x86/pci.h b/include/asm-x86/pci.h index 52a29f7668e..9584d6d5eb9 100644 --- a/include/asm-x86/pci.h +++ b/include/asm-x86/pci.h @@ -20,7 +20,7 @@ struct pci_sysdata { extern int pci_routeirq; extern int noioapicquirk; -extern int ioapicreroute; +extern int noioapicreroute; /* scan a bus after allocating a pci_sysdata for it */ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, -- cgit v1.2.3-70-g09d2 From 1a960b402a51d80abf54e3f8e4972374ffe5f22d Mon Sep 17 00:00:00 2001 From: Jason Yeh Date: Wed, 23 Jul 2008 23:05:53 +0200 Subject: Oprofile Multiplexing Patch This patch introduces multiplexing support for the Oprofile kernel module. It basically adds a new function pointer in oprofile_operator allowing each architecture to supply its callback to switch between different sets of event when the timer expires. Userspace tools can modify the time slice through /dev/oprofile/time_slice. It also modifies the number of counters exposed to the userspace through /dev/oprofile. For example, the number of counters for AMD CPUs are changed to 32 and multiplexed in the sets of 4. Signed-off-by: Jason Yeh Signed-off-by: Robert Richter Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 100 +++++++++++++++++++++++++++++++++++--- arch/x86/oprofile/op_counter.h | 3 +- arch/x86/oprofile/op_model_amd.c | 76 +++++++++++++++++------------ arch/x86/oprofile/op_model_p4.c | 4 ++ arch/x86/oprofile/op_model_ppro.c | 2 + arch/x86/oprofile/op_x86_model.h | 3 ++ drivers/oprofile/oprof.c | 58 ++++++++++++++++++++-- drivers/oprofile/oprof.h | 4 +- drivers/oprofile/oprofile_files.c | 39 ++++++++++++++- include/linux/oprofile.h | 3 ++ 10 files changed, 247 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 287513a0981..2a65fe7680a 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -23,12 +23,18 @@ #include "op_counter.h" #include "op_x86_model.h" +DEFINE_PER_CPU(int, switch_index); + static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); static int nmi_start(void); static void nmi_stop(void); +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs); +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs); +static void nmi_cpu_stop(void *dummy); +static void nmi_cpu_start(void *dummy); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; @@ -81,6 +87,47 @@ static void exit_sysfs(void) #define exit_sysfs() do { } while (0) #endif /* CONFIG_PM */ +static void nmi_cpu_switch(void *dummy) +{ + int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_hardware_counters; + if ((si > model->num_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, smp_processor_id()) = 0; + else + per_cpu(switch_index, smp_processor_id()) = si; + + nmi_cpu_restore_mpx_registers(msrs); + model->setup_ctrs(msrs); + nmi_cpu_start(NULL); +} + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_hardware_counters].count ? 0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (nmi_multiplex_on() < 0) + return -EINVAL; + + on_each_cpu(nmi_cpu_switch, NULL, 0, 1); + + return 0; +} + static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -144,11 +191,10 @@ static void free_msrs(void) static int allocate_msrs(void) { - int success = 1; + int i, success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; - int i; for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, GFP_KERNEL); @@ -156,8 +202,8 @@ static int allocate_msrs(void) success = 0; break; } - per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, - GFP_KERNEL); + per_cpu(cpu_msrs, i).controls = + kmalloc(controls_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) { success = 0; break; @@ -201,7 +247,8 @@ static int nmi_setup(void) return err; } - /* We need to serialize save and setup for HT because the subset + /* + * We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations */ @@ -217,7 +264,6 @@ static int nmi_setup(void) per_cpu(cpu_msrs, 0).controls, sizeof(struct op_msr) * model->num_controls); } - } on_each_cpu(nmi_save_registers, NULL, 1); on_each_cpu(nmi_cpu_setup, NULL, 1); @@ -225,7 +271,41 @@ static int nmi_setup(void) return 0; } -static void nmi_restore_registers(struct op_msrs *msrs) +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + unsigned int si = __get_cpu_var(switch_index); + unsigned int const nr_ctrs = model->num_hardware_counters; + struct op_msr *counters = &msrs->counters[si]; + unsigned int i; + + for (i = 0; i < nr_ctrs; ++i) { + int offset = i + si; + if (counters[offset].addr) { + rdmsr(counters[offset].addr, + counters[offset].multiplex.low, + counters[offset].multiplex.high); + } + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + unsigned int si = __get_cpu_var(switch_index); + unsigned int const nr_ctrs = model->num_hardware_counters; + struct op_msr *counters = &msrs->counters[si]; + unsigned int i; + + for (i = 0; i < nr_ctrs; ++i) { + int offset = i + si; + if (counters[offset].addr) { + wrmsr(counters[offset].addr, + counters[offset].multiplex.low, + counters[offset].multiplex.high); + } + } +} + +static void nmi_cpu_restore_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; unsigned int const nr_ctrls = model->num_controls; @@ -265,7 +345,8 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); - nmi_restore_registers(msrs); + nmi_cpu_restore_registers(msrs); + __get_cpu_var(switch_index) = 0; } static void nmi_shutdown(void) @@ -328,6 +409,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + counter_config[i].save_count_low = 0; } return 0; @@ -469,12 +551,14 @@ int __init op_nmi_init(struct oprofile_operations *ops) } /* default values, can be overwritten by model */ + __get_cpu_var(switch_index) = 0; ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; + ops->switch_events = nmi_switch_event; if (model->init) ret = model->init(ops); diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 2880b15c467..786d6e01cf7 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -10,13 +10,14 @@ #ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 8 +#define OP_MAX_COUNTER 32 /* Per-perfctr configuration as set via * oprofilefs. */ struct op_counter_config { unsigned long count; + unsigned long save_count_low; unsigned long enabled; unsigned long event; unsigned long kernel; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index d9faf607b3a..bbf2b68bcc5 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -23,8 +24,10 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 4 -#define NUM_CONTROLS 4 +#define NUM_COUNTERS 32 +#define NUM_HARDWARE_COUNTERS 4 +#define NUM_CONTROLS 32 +#define NUM_HARDWARE_CONTROLS 4 #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) @@ -48,6 +51,7 @@ #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) static unsigned long reset_value[NUM_COUNTERS]; +DECLARE_PER_CPU(int, switch_index); #ifdef CONFIG_OPROFILE_IBS @@ -130,15 +134,17 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) int i; for (i = 0; i < NUM_COUNTERS; i++) { - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + int hw_counter = i % NUM_HARDWARE_COUNTERS; + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + hw_counter)) + msrs->counters[i].addr = MSR_K7_PERFCTR0 + hw_counter; else msrs->counters[i].addr = 0; } for (i = 0; i < NUM_CONTROLS; i++) { - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + int hw_control = i % NUM_HARDWARE_CONTROLS; + if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + hw_control)) + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + hw_control; else msrs->controls[i].addr = 0; } @@ -150,8 +156,16 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) unsigned int low, high; int i; + for (i = 0; i < NUM_HARDWARE_CONTROLS; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (counter_config[offset].enabled) + reset_value[offset] = counter_config[offset].count; + else + reset_value[offset] = 0; + } + /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { + for (i = 0 ; i < NUM_HARDWARE_CONTROLS; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; CTRL_READ(low, high, msrs, i); @@ -161,34 +175,31 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) } /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs, i))) continue; CTR_WRITE(1, msrs, i); } /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { - reset_value[i] = counter_config[i].count; - - CTR_WRITE(counter_config[i].count, msrs, i); + for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { + int offset = i + __get_cpu_var(switch_index); + if ((counter_config[offset].enabled) && (CTR_IS_RESERVED(msrs, i))) { + CTR_WRITE(counter_config[offset].count, msrs, i); CTRL_READ(low, high, msrs, i); CTRL_CLEAR_LO(low); CTRL_CLEAR_HI(high); CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); + CTRL_SET_USR(low, counter_config[offset].user); + CTRL_SET_KERN(low, counter_config[offset].kernel); + CTRL_SET_UM(low, counter_config[offset].unit_mask); + CTRL_SET_EVENT_LOW(low, counter_config[offset].event); + CTRL_SET_EVENT_HIGH(high, counter_config[offset].event); CTRL_SET_HOST_ONLY(high, 0); CTRL_SET_GUEST_ONLY(high, 0); CTRL_WRITE(low, high, msrs, i); - } else { - reset_value[i] = 0; } } } @@ -276,13 +287,14 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, unsigned int low, high; int i; - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (!reset_value[offset]) continue; CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); + oprofile_add_sample(regs, offset); + CTR_WRITE(reset_value[offset], msrs, i); } } @@ -298,8 +310,10 @@ static void op_amd_start(struct op_msrs const * const msrs) { unsigned int low, high; int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (reset_value[i]) { + + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (reset_value[offset]) { CTRL_READ(low, high, msrs, i); CTRL_SET_ACTIVE(low); CTRL_WRITE(low, high, msrs, i); @@ -329,8 +343,8 @@ static void op_amd_stop(struct op_msrs const * const msrs) /* Subtle: stop on all counters to avoid race with * setting our pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (!reset_value[i]) + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + if (!reset_value[i + per_cpu(switch_index, smp_processor_id())]) continue; CTRL_READ(low, high, msrs, i); CTRL_SET_INACTIVE(low); @@ -356,11 +370,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { if (CTR_IS_RESERVED(msrs, i)) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { if (CTRL_IS_RESERVED(msrs, i)) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } @@ -534,6 +548,8 @@ struct op_x86_model_spec const op_amd_spec = { .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .num_hardware_counters = NUM_HARDWARE_COUNTERS, + .num_hardware_controls = NUM_HARDWARE_CONTROLS, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, .check_ctrs = &op_amd_check_ctrs, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 56b4757a1f4..e641545d479 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -701,6 +701,8 @@ static void p4_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, + .num_hardware_counters = NUM_COUNTERS_HT2, + .num_hardware_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -713,6 +715,8 @@ struct op_x86_model_spec const op_p4_ht2_spec = { struct op_x86_model_spec const op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, + .num_hardware_counters = NUM_COUNTERS_NON_HT, + .num_hardware_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index eff431f6c57..e5811aa480e 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -183,6 +183,8 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .num_hardware_counters = NUM_COUNTERS, + .num_hardware_controls = NUM_CONTROLS, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 05a0261ba0c..e07ba107637 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -19,6 +19,7 @@ struct op_saved_msr { struct op_msr { unsigned long addr; struct op_saved_msr saved; + struct op_saved_msr multiplex; }; struct op_msrs { @@ -34,6 +35,8 @@ struct pt_regs; struct op_x86_model_spec { int (*init)(struct oprofile_operations *ops); void (*exit)(void); + unsigned int const num_hardware_counters; + unsigned int const num_hardware_controls; unsigned int const num_counters; unsigned int const num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 2c645170f06..b2fa5df64a6 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include "oprof.h" @@ -19,13 +21,18 @@ #include "cpu_buffer.h" #include "buffer_sync.h" #include "oprofile_stats.h" + +static unsigned long is_setup; +static void switch_worker(struct work_struct *work); +static DECLARE_DELAYED_WORK(switch_work, switch_worker); +static DEFINE_MUTEX(start_mutex); struct oprofile_operations oprofile_ops; +unsigned long timeout_jiffies; unsigned long oprofile_started; unsigned long backtrace_depth; -static unsigned long is_setup; -static DEFINE_MUTEX(start_mutex); +/* Multiplexing defaults at 1 msec*/ /* timer 0 - use performance monitoring hardware if available @@ -87,6 +94,16 @@ out: return err; } +static void start_switch_worker(void) +{ + schedule_delayed_work(&switch_work, timeout_jiffies); +} + +static void switch_worker(struct work_struct *work) +{ + if (!oprofile_ops.switch_events()) + start_switch_worker(); +} /* Actually start profiling (echo 1>/dev/oprofile/enable) */ int oprofile_start(void) @@ -94,7 +111,6 @@ int oprofile_start(void) int err = -EINVAL; mutex_lock(&start_mutex); - if (!is_setup) goto out; @@ -108,6 +124,9 @@ int oprofile_start(void) if ((err = oprofile_ops.start())) goto out; + if (oprofile_ops.switch_events) + start_switch_worker(); + oprofile_started = 1; out: mutex_unlock(&start_mutex); @@ -123,6 +142,7 @@ void oprofile_stop(void) goto out; oprofile_ops.stop(); oprofile_started = 0; + cancel_delayed_work_sync(&switch_work); /* wake up the daemon to read what remains */ wake_up_buffer_waiter(); out: @@ -155,6 +175,32 @@ post_sync: mutex_unlock(&start_mutex); } +/* User inputs in ms, converts to jiffies */ +int oprofile_set_timeout(unsigned long val_msec) +{ + int err = 0; + + mutex_lock(&start_mutex); + + if (oprofile_started) { + err = -EBUSY; + goto out; + } + + if (!oprofile_ops.switch_events) { + err = -EINVAL; + goto out; + } + + timeout_jiffies = msecs_to_jiffies(val_msec); + if (timeout_jiffies == MAX_JIFFY_OFFSET) + timeout_jiffies = msecs_to_jiffies(1); + +out: + mutex_unlock(&start_mutex); + return err; + +} int oprofile_set_backtrace(unsigned long val) { @@ -179,10 +225,16 @@ out: return err; } +static void __init oprofile_switch_timer_init(void) +{ + timeout_jiffies = msecs_to_jiffies(1); +} + static int __init oprofile_init(void) { int err; + oprofile_switch_timer_init(); err = oprofile_arch_init(&oprofile_ops); if (err < 0 || timer) { diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index 18323650806..c4406a7366b 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -27,7 +27,8 @@ extern unsigned long fs_buffer_watershed; extern struct oprofile_operations oprofile_ops; extern unsigned long oprofile_started; extern unsigned long backtrace_depth; - +extern unsigned long timeout_jiffies; + struct super_block; struct dentry; @@ -35,5 +36,6 @@ void oprofile_create_files(struct super_block * sb, struct dentry * root); void oprofile_timer_init(struct oprofile_operations * ops); int oprofile_set_backtrace(unsigned long depth); +int oprofile_set_timeout(unsigned long time); #endif /* OPROF_H */ diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index ef953ba5ab6..cc4f5a1f8ef 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -9,6 +9,7 @@ #include #include +#include #include "event_buffer.h" #include "oprofile_stats.h" @@ -18,6 +19,40 @@ unsigned long fs_buffer_size = 131072; unsigned long fs_cpu_buffer_size = 8192; unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */ +static ssize_t timeout_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + return oprofilefs_ulong_to_user(jiffies_to_msecs(timeout_jiffies), + buf, count, offset); +} + + +static ssize_t timeout_write(struct file *file, char const __user *buf, + size_t count, loff_t *offset) +{ + unsigned long val; + int retval; + + if (*offset) + return -EINVAL; + + retval = oprofilefs_ulong_from_user(&val, buf, count); + if (retval) + return retval; + + retval = oprofile_set_timeout(val); + + if (retval) + return retval; + return count; +} + +static const struct file_operations timeout_fops = { + .read = timeout_read, + .write = timeout_write, +}; + + static ssize_t depth_read(struct file * file, char __user * buf, size_t count, loff_t * offset) { return oprofilefs_ulong_to_user(backtrace_depth, buf, count, offset); @@ -85,11 +120,10 @@ static ssize_t enable_write(struct file * file, char const __user * buf, size_t if (*offset) return -EINVAL; - retval = oprofilefs_ulong_from_user(&val, buf, count); if (retval) return retval; - + if (val) retval = oprofile_start(); else @@ -129,6 +163,7 @@ void oprofile_create_files(struct super_block * sb, struct dentry * root) oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops); + oprofilefs_create_file(sb, root, "timeout_ms", &timeout_fops); oprofile_create_stats_files(sb, root); if (oprofile_ops.create_files) oprofile_ops.create_files(sb, root); diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index bcb8f725427..687f2f4c36a 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -67,6 +67,9 @@ struct oprofile_operations { /* Initiate a stack backtrace. Optional. */ void (*backtrace)(struct pt_regs * const regs, unsigned int depth); + + /* Multiplex between different events. Optional. */ + int (*switch_events)(void); /* CPU identification string. */ char * cpu_type; }; -- cgit v1.2.3-70-g09d2 From 7e7b43892b87b6be259479ef4de14029dcb4012f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 24 Jul 2008 16:00:16 +0200 Subject: x86/oprofile: fix on_each_cpu build error Signed-off-by: Robert Richter Cc: oprofile-list Cc: Jason Yeh Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 2a65fe7680a..fb4902bc6f1 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -123,7 +123,7 @@ static int nmi_switch_event(void) if (nmi_multiplex_on() < 0) return -EINVAL; - on_each_cpu(nmi_cpu_switch, NULL, 0, 1); + on_each_cpu(nmi_cpu_switch, NULL, 1); return 0; } -- cgit v1.2.3-70-g09d2 From beb20d52d03a51218827fb4a36a4b583debb03f9 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Sep 2008 14:55:57 -0700 Subject: hrtimer: convert kvm to the new hrtimer apis In order to be able to do range hrtimers we need to use accessor functions to the "expire" member of the hrtimer struct. This patch converts KVM to these accessors. Signed-off-by: Arjan van de Ven --- arch/x86/kvm/i8254.c | 6 +++--- arch/x86/kvm/lapic.c | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c0f7872a912..1bf8f57a304 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -205,8 +205,8 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) wake_up_interruptible(&vcpu0->wq); } - pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); - pt->scheduled = ktime_to_ns(pt->timer.expires); + hrtimer_add_expires_ns(&pt->timer, pt->period); + pt->scheduled = ktime_to_ns(hrtimer_get_expires(&pt->timer)); return (pt->period == 0 ? 0 : 1); } @@ -246,7 +246,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) timer = &pit->pit_state.pit_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } static void destroy_pit_timer(struct kvm_kpit_timer *pt) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 73f43de69f6..a5b61de6adf 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -953,9 +953,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) } if (apic_lvtt_period(apic)) { result = 1; - apic->timer.dev.expires = ktime_add_ns( - apic->timer.dev.expires, - apic->timer.period); + hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period); } return result; } @@ -1124,7 +1122,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &apic->timer.dev; if (hrtimer_cancel(timer)) - hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From c10d38dda1774ed4540380333cabd229eff37094 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 10 Sep 2008 13:37:17 +0200 Subject: x86: some lock annotations for user copy paths copy_to/from_user and all its variants (except the atomic ones) can take a page fault and perform non-trivial work like taking mmap_sem and entering the filesyste/pagecache. Unfortunately, this often escapes lockdep because a common pattern is to use it to read in some arguments just set up from userspace, or write data back to a hot buffer. In those cases, it will be unlikely for page reclaim to get a window in to cause copy_*_user to fault. With the new might_lock primitives, add some annotations to x86. I don't know if I caught all possible faulting points (it's a bit of a maze, and I didn't really look at 32-bit). But this is a starting point. Boots and runs OK so far. Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy_32.c | 7 ++++++- arch/x86/lib/usercopy_64.c | 4 ++++ include/asm-x86/uaccess.h | 14 ++++++++++++++ include/asm-x86/uaccess_32.h | 10 ++++++++-- include/asm-x86/uaccess_64.h | 12 ++++++++++++ 5 files changed, 44 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 24e60944971..8eedde2a9ca 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -33,6 +33,8 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon do { \ int __d0, __d1, __d2; \ might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ @@ -120,6 +122,8 @@ EXPORT_SYMBOL(strncpy_from_user); do { \ int __d0; \ might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ @@ -148,7 +152,6 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { - might_sleep(); if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); return n; @@ -191,6 +194,8 @@ long strnlen_user(const char __user *s, long n) unsigned long res, tmp; might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); __asm__ __volatile__( " testl %0, %0\n" diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index f4df6e7c718..847d1294599 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -16,6 +16,8 @@ do { \ long __d0, __d1, __d2; \ might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __asm__ __volatile__( \ " testq %1,%1\n" \ " jz 2f\n" \ @@ -65,6 +67,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); /* no memory constraint because it doesn't change any memory gcc knows about */ asm volatile( diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h index 5f702d1d521..ad29752a171 100644 --- a/include/asm-x86/uaccess.h +++ b/include/asm-x86/uaccess.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include @@ -157,6 +159,9 @@ extern int __get_user_bad(void); int __ret_gu; \ unsigned long __val_gu; \ __chk_user_ptr(ptr); \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_x(1, __ret_gu, __val_gu, ptr); \ @@ -241,6 +246,9 @@ extern void __put_user_8(void); int __ret_pu; \ __typeof__(*(ptr)) __pu_val; \ __chk_user_ptr(ptr); \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __pu_val = x; \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -265,6 +273,9 @@ extern void __put_user_8(void); #define __put_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ @@ -317,6 +328,9 @@ do { \ #define __get_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ + might_sleep(); \ + if (current->mm) \ + might_lock_read(¤t->mm->mmap_sem); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ diff --git a/include/asm-x86/uaccess_32.h b/include/asm-x86/uaccess_32.h index 6fdef39a0bc..d725e2d703f 100644 --- a/include/asm-x86/uaccess_32.h +++ b/include/asm-x86/uaccess_32.h @@ -82,8 +82,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { - might_sleep(); - return __copy_to_user_inatomic(to, from, n); + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); + return __copy_to_user_inatomic(to, from, n); } static __always_inline unsigned long @@ -138,6 +140,8 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (__builtin_constant_p(n)) { unsigned long ret; @@ -160,6 +164,8 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, const void __user *from, unsigned long n) { might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h index 515d4dce96b..40a7205fe57 100644 --- a/include/asm-x86/uaccess_64.h +++ b/include/asm-x86/uaccess_64.h @@ -28,6 +28,10 @@ static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; + + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -70,6 +74,10 @@ static __always_inline __must_check int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; + + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -112,6 +120,10 @@ static __always_inline __must_check int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; + + might_sleep(); + if (current->mm) + might_lock_read(¤t->mm->mmap_sem); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, (__force void *)src, size); -- cgit v1.2.3-70-g09d2 From 3ee1afa308f2a38e5d1e2ad3752ad7abcf480da1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 10 Sep 2008 13:37:17 +0200 Subject: x86: some lock annotations for user copy paths, v2 - introduce might_fault() - handle the atomic user copy paths correctly [ mingo@elte.hu: move might_sleep() outside of in_atomic(). ] Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy_32.c | 12 +++--------- arch/x86/lib/usercopy_64.c | 8 ++------ include/asm-x86/uaccess.h | 18 ++++-------------- include/asm-x86/uaccess_32.h | 12 +++--------- include/asm-x86/uaccess_64.h | 12 +++--------- include/linux/kernel.h | 9 +++++++++ mm/memory.c | 15 +++++++++++++++ 7 files changed, 39 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 8eedde2a9ca..7393152a252 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -32,9 +32,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon #define __do_strncpy_from_user(dst, src, count, res) \ do { \ int __d0, __d1, __d2; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __asm__ __volatile__( \ " testl %1,%1\n" \ " jz 2f\n" \ @@ -121,9 +119,7 @@ EXPORT_SYMBOL(strncpy_from_user); #define __do_clear_user(addr,size) \ do { \ int __d0; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ @@ -193,9 +189,7 @@ long strnlen_user(const char __user *s, long n) unsigned long mask = -__addr_ok(s); unsigned long res, tmp; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); __asm__ __volatile__( " testl %0, %0\n" diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 847d1294599..64d6c84e635 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -15,9 +15,7 @@ #define __do_strncpy_from_user(dst,src,count,res) \ do { \ long __d0, __d1, __d2; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __asm__ __volatile__( \ " testq %1,%1\n" \ " jz 2f\n" \ @@ -66,9 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user); unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ asm volatile( diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h index ad29752a171..39f8420c75d 100644 --- a/include/asm-x86/uaccess.h +++ b/include/asm-x86/uaccess.h @@ -8,8 +8,6 @@ #include #include #include -#include -#include #include #include @@ -159,9 +157,7 @@ extern int __get_user_bad(void); int __ret_gu; \ unsigned long __val_gu; \ __chk_user_ptr(ptr); \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_x(1, __ret_gu, __val_gu, ptr); \ @@ -246,9 +242,7 @@ extern void __put_user_8(void); int __ret_pu; \ __typeof__(*(ptr)) __pu_val; \ __chk_user_ptr(ptr); \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __pu_val = x; \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -273,9 +267,7 @@ extern void __put_user_8(void); #define __put_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ @@ -328,9 +320,7 @@ do { \ #define __get_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_sleep(); \ - if (current->mm) \ - might_lock_read(¤t->mm->mmap_sem); \ + might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ diff --git a/include/asm-x86/uaccess_32.h b/include/asm-x86/uaccess_32.h index d725e2d703f..d10e842ec3e 100644 --- a/include/asm-x86/uaccess_32.h +++ b/include/asm-x86/uaccess_32.h @@ -82,9 +82,7 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n) { - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); return __copy_to_user_inatomic(to, from, n); } @@ -139,9 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; @@ -163,9 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline unsigned long __copy_from_user_nocache(void *to, const void __user *from, unsigned long n) { - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h index 40a7205fe57..13fd56fbc3a 100644 --- a/include/asm-x86/uaccess_64.h +++ b/include/asm-x86/uaccess_64.h @@ -29,9 +29,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -75,9 +73,7 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { @@ -121,9 +117,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; - might_sleep(); - if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_fault(); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, (__force void *)src, size); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2651f805ba6..e580ec09576 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -140,6 +140,15 @@ extern int _cond_resched(void); (__x < 0) ? -__x : __x; \ }) +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void); +#else +static inline void might_fault(void) +{ + might_sleep(); +} +#endif + extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(long time); NORET_TYPE void panic(const char * fmt, ...) diff --git a/mm/memory.c b/mm/memory.c index 1002f473f49..b8fdf4e5e65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3016,3 +3016,18 @@ void print_vma_addr(char *prefix, unsigned long ip) } up_read(¤t->mm->mmap_sem); } + +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void) +{ + might_sleep(); + /* + * it would be nicer only to annotate paths which are not under + * pagefault_disable, however that requires a larger audit and + * providing helpers like get_user_atomic. + */ + if (!in_atomic() && current->mm) + might_lock_read(¤t->mm->mmap_sem); +} +EXPORT_SYMBOL(might_fault); +#endif -- cgit v1.2.3-70-g09d2 From 1d18ef489509314506328b9e464dd47c24c1d68f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Sep 2008 20:53:21 +0200 Subject: x86: some lock annotations for user copy paths, v3 - add annotation back to clear_user() - change probe_kernel_address() to _inatomic*() method Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy_32.c | 1 + include/asm-x86/uaccess.h | 2 -- include/linux/uaccess.h | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7393152a252..fab5faba1d3 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -148,6 +148,7 @@ do { \ unsigned long clear_user(void __user *to, unsigned long n) { + might_fault(); if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); return n; diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h index 39f8420c75d..dc8edb5c465 100644 --- a/include/asm-x86/uaccess.h +++ b/include/asm-x86/uaccess.h @@ -267,7 +267,6 @@ extern void __put_user_8(void); #define __put_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ @@ -320,7 +319,6 @@ do { \ #define __get_user_size(x, ptr, size, retval, errret) \ do { \ retval = 0; \ - might_fault(); \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index fec6decfb98..2062293e57e 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to, \ set_fs(KERNEL_DS); \ pagefault_disable(); \ - ret = __get_user(retval, (__force typeof(retval) __user *)(addr)); \ + ret = __copy_from_user_inatomic((__force typeof(retval) __user *)(addr), &(retval), sizeof(retval)); \ pagefault_enable(); \ set_fs(old_fs); \ ret; \ -- cgit v1.2.3-70-g09d2 From 600715dcdf567c86f8b2c6173fcfb4b873e25a19 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 01:31:45 -0700 Subject: generic: add phys_addr_t for holding physical addresses Add a kernel-wide "phys_addr_t" which is guaranteed to be able to hold any physical address. By default it equals the word size of the architecture, but a 32-bit architecture can set ARCH_PHYS_ADDR_T_64BIT if it needs a 64-bit phys_addr_t. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/powerpc/Kconfig | 3 +++ arch/powerpc/include/asm/types.h | 7 ------- arch/x86/Kconfig | 3 +++ include/asm-x86/page_32.h | 2 -- include/asm-x86/page_64.h | 1 - include/linux/types.h | 6 ++++++ mm/Kconfig | 3 +++ 7 files changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 587da5e0990..f5f83ee6041 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -22,6 +22,9 @@ config WORD_SIZE config PPC_MERGE def_bool y +config ARCH_PHYS_ADDR_T_64BIT + def_bool PPC64 || PHYS_64BIT + config MMU bool default y diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h index d3374bc865b..c646f34c4e8 100644 --- a/arch/powerpc/include/asm/types.h +++ b/arch/powerpc/include/asm/types.h @@ -48,13 +48,6 @@ typedef struct { typedef __vector128 vector128; -/* Physical address used by some IO functions */ -#if defined(CONFIG_PPC64) || defined(CONFIG_PHYS_64BIT) -typedef u64 phys_addr_t; -#else -typedef u32 phys_addr_t; -#endif - #ifdef __powerpc64__ typedef u64 dma_addr_t; #else diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d132..a0ffb5188c8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -932,6 +932,9 @@ config X86_PAE has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process. +config ARCH_PHYS_ADDR_T_64BIT + def_bool X86_64 || X86_PAE + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index ab8528793f0..d0e58605a52 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -33,7 +33,6 @@ typedef u64 pmdval_t; typedef u64 pudval_t; typedef u64 pgdval_t; typedef u64 pgprotval_t; -typedef u64 phys_addr_t; typedef union { struct { @@ -54,7 +53,6 @@ typedef unsigned long pmdval_t; typedef unsigned long pudval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; -typedef unsigned long phys_addr_t; typedef union { pteval_t pte; diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index c6916c83e6b..2456fbf2d7d 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -79,7 +79,6 @@ typedef unsigned long pmdval_t; typedef unsigned long pudval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; -typedef unsigned long phys_addr_t; typedef struct page *pgtable_t; diff --git a/include/linux/types.h b/include/linux/types.h index d4a9ce6e276..022c668496d 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -197,6 +197,12 @@ typedef u64 resource_size_t; typedef u32 resource_size_t; #endif +#ifdef CONFIG_PHYS_ADDR_T_64BIT +typedef u64 phys_addr_t; +#else +typedef u32 phys_addr_t; +#endif + struct ustat { __kernel_daddr_t f_tfree; __kernel_ino_t f_tinode; diff --git a/mm/Kconfig b/mm/Kconfig index 0bd9c2dbb2a..91ee3922510 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,6 +187,9 @@ config RESOURCES_64BIT help This option allows memory and IO resources to be 64 bit. +config PHYS_ADDR_T_64BIT + def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + config ZONE_DMA_FLAG int default "0" if !ZONE_DMA -- cgit v1.2.3-70-g09d2 From 8308c54d7e312f7a03e2ce2057d0837e6fe3843f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 01:31:50 -0700 Subject: generic: redefine resource_size_t as phys_addr_t There's no good reason why a resource_size_t shouldn't just be a physical address, so simply redefine it in terms of phys_addr_t. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/powerpc/platforms/Kconfig.cputype | 1 - arch/powerpc/sysdev/ppc4xx_pci.c | 16 ++++++---------- arch/x86/Kconfig | 1 - arch/x86/kernel/e820.c | 4 +--- drivers/pci/setup-bus.c | 9 ++++----- include/linux/types.h | 8 ++------ 6 files changed, 13 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7f651273386..be852fd407a 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -135,7 +135,6 @@ config PTE_64BIT config PHYS_64BIT bool 'Large physical address support' if E500 depends on 44x || E500 - select RESOURCES_64BIT default y if 44x ---help--- This option enables kernel support for larger than 32-bit physical diff --git a/arch/powerpc/sysdev/ppc4xx_pci.c b/arch/powerpc/sysdev/ppc4xx_pci.c index fb368dfde5d..e8a76d9539d 100644 --- a/arch/powerpc/sysdev/ppc4xx_pci.c +++ b/arch/powerpc/sysdev/ppc4xx_pci.c @@ -41,13 +41,10 @@ extern unsigned long total_memory; #define U64_TO_U32_LOW(val) ((u32)((val) & 0x00000000ffffffffULL)) #define U64_TO_U32_HIGH(val) ((u32)((val) >> 32)) -#ifdef CONFIG_RESOURCES_64BIT -#define RES_TO_U32_LOW(val) U64_TO_U32_LOW(val) -#define RES_TO_U32_HIGH(val) U64_TO_U32_HIGH(val) -#else -#define RES_TO_U32_LOW(val) (val) -#define RES_TO_U32_HIGH(val) (0) -#endif +#define RES_TO_U32_LOW(val) \ + ((sizeof(resource_size_t) > sizeof(u32)) ? U64_TO_U32_LOW(val) : (val)) +#define RES_TO_U32_HIGH(val) \ + ((sizeof(resource_size_t) > sizeof(u32)) ? U64_TO_U32_HIGH(val) : (0)) static inline int ppc440spe_revA(void) { @@ -145,12 +142,11 @@ static int __init ppc4xx_parse_dma_ranges(struct pci_controller *hose, /* Use that */ res->start = pci_addr; -#ifndef CONFIG_RESOURCES_64BIT /* Beware of 32 bits resources */ - if ((pci_addr + size) > 0x100000000ull) + if (sizeof(resource_size_t) == sizeof(u32) && + (pci_addr + size) > 0x100000000ull) res->end = 0xffffffff; else -#endif res->end = res->start + size - 1; break; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a0ffb5188c8..b4e1875f986 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -925,7 +925,6 @@ config X86_PAE def_bool n prompt "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G - select RESOURCES_64BIT help PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 66e48aa2dd1..477f4bb7e55 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1276,12 +1276,10 @@ void __init e820_reserve_resources(void) res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; -#ifndef CONFIG_RESOURCES_64BIT - if (end > 0x100000000ULL) { + if (end != (resource_size_t)end) { res++; continue; } -#endif res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 1aad599816f..f250a90ee45 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -378,11 +378,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long align = 0; min_align = 0; for (order = 0; order <= max_order; order++) { -#ifdef CONFIG_RESOURCES_64BIT - resource_size_t align1 = 1ULL << (order + 20); -#else - resource_size_t align1 = 1U << (order + 20); -#endif + resource_size_t align1 = 1; + + align1 <<= (order + 20); + if (!align) min_align = align1; else if (ALIGN(align + min_align, min_align) < align1) diff --git a/include/linux/types.h b/include/linux/types.h index 022c668496d..f24f7beb47d 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -191,18 +191,14 @@ typedef __u32 __bitwise __wsum; #ifdef __KERNEL__ typedef unsigned __bitwise__ gfp_t; -#ifdef CONFIG_RESOURCES_64BIT -typedef u64 resource_size_t; -#else -typedef u32 resource_size_t; -#endif - #ifdef CONFIG_PHYS_ADDR_T_64BIT typedef u64 phys_addr_t; #else typedef u32 phys_addr_t; #endif +typedef phys_addr_t resource_size_t; + struct ustat { __kernel_daddr_t f_tfree; __kernel_ino_t f_tinode; -- cgit v1.2.3-70-g09d2 From 45f197ade73ba95681b9803680c75352fc0a1c0a Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 20 Sep 2008 12:58:40 +0200 Subject: x86, oprofile: BUG: using smp_processor_id() in preemptible code Add __raw access before setting per cpu variable switch_index, to avoid the following BUG: [ 449.166827] BUG: using smp_processor_id() in preemptible [00000000] code: modprobe/6998 [ 449.166848] caller is op_nmi_init+0xf0/0x2b0 [oprofile] [ 449.166855] Pid: 6998, comm: modprobe Not tainted 2.6.27-rc5-mm1 #29 [ 449.166860] Call Trace: [ 449.166872] [] debug_smp_processor_id+0xd7/0xe0 [ 449.166887] [] op_nmi_init+0xf0/0x2b0 [oprofile] [ 449.166902] [] oprofile_init+0x0/0x60 [oprofile] [ 449.166915] [] oprofile_arch_init+0x9/0x30 [oprofile] [ 449.166928] [] oprofile_init+0x1e/0x60 [oprofile] [ 449.166937] [] _stext+0x3b/0x160 [ 449.166946] [] __mutex_unlock_slowpath+0xe5/0x190 [ 449.166955] [] trace_hardirqs_on_caller+0xca/0x140 [ 449.166965] [] sys_init_module+0xdc/0x210 [ 449.166972] [] system_call_fastpath+0x16/0x1b Signed-off-by: Andrea Righi Acked-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index fb4902bc6f1..4108d02c529 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -551,7 +551,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) } /* default values, can be overwritten by model */ - __get_cpu_var(switch_index) = 0; + __raw_get_cpu_var(switch_index) = 0; ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; -- cgit v1.2.3-70-g09d2 From 2fd47094f92fa2bdbf99be33294a7b6b97785a70 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Mon, 1 Sep 2008 14:27:03 +0200 Subject: CPUFREQ: powernow-k8: Try to detect old BIOS, not supporting CPU freq on a recent AMD CPUs. Make use of FW_BUG interface to give vendors and users the ability to automatically check for powernow-k8 related BIOS bugs by: dmesg |grep "Firmware Bug" Signed-off-by: Thomas Renninger Signed-off-by: Andi Kleen Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 42 ++++++++++++++++++------------- 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 84bb395038d..4e0c6abd7ca 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -45,7 +45,6 @@ #endif #define PFX "powernow-k8: " -#define BFX PFX "BIOS error: " #define VERSION "version 2.20.00" #include "powernow-k8.h" @@ -536,35 +535,40 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 for (j = 0; j < data->numps; j++) { if (pst[j].vid > LEAST_VID) { - printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid); + printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", + j, pst[j].vid); return -EINVAL; } if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ - printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j); + printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" + " %d\n", j); return -ENODEV; } if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ - printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j); + printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" + " %d\n", j); return -ENODEV; } if (pst[j].fid > MAX_FID) { - printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j); + printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" + " %d\n", j); return -ENODEV; } if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { /* Only first fid is allowed to be in "low" range */ - printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid); + printk(KERN_ERR FW_BUG PFX "two low fids - %d : " + "0x%x\n", j, pst[j].fid); return -EINVAL; } if (pst[j].fid < lastfid) lastfid = pst[j].fid; } if (lastfid & 1) { - printk(KERN_ERR BFX "lastfid invalid\n"); + printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); return -EINVAL; } if (lastfid > LO_FID_TABLE_TOP) - printk(KERN_INFO BFX "first fid not from lo freq table\n"); + printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n"); return 0; } @@ -672,13 +676,13 @@ static int find_psb_table(struct powernow_k8_data *data) dprintk("table vers: 0x%x\n", psb->tableversion); if (psb->tableversion != PSB_VERSION_1_4) { - printk(KERN_ERR BFX "PSB table is not v1.4\n"); + printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); return -ENODEV; } dprintk("flags: 0x%x\n", psb->flags1); if (psb->flags1) { - printk(KERN_ERR BFX "unknown flags\n"); + printk(KERN_ERR FW_BUG PFX "unknown flags\n"); return -ENODEV; } @@ -705,7 +709,7 @@ static int find_psb_table(struct powernow_k8_data *data) } } if (cpst != 1) { - printk(KERN_ERR BFX "numpst must be 1\n"); + printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); return -ENODEV; } @@ -1130,17 +1134,19 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) "ACPI Processor module before starting this " "driver.\n"); #else - printk(KERN_ERR PFX "Your BIOS does not provide ACPI " - "_PSS objects in a way that Linux understands. " - "Please report this to the Linux ACPI maintainers" - " and complain to your BIOS vendor.\n"); + printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide" + " ACPI _PSS objects in a way that Linux " + "understands. Please report this to the Linux " + "ACPI maintainers and complain to your BIOS " + "vendor.\n"); #endif kfree(data); return -ENODEV; } if (pol->cpu != 0) { - printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than " - "CPU0. Complain to your BIOS vendor.\n"); + printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " + "CPU other than CPU0. Complain to your BIOS " + "vendor.\n"); kfree(data); return -ENODEV; } @@ -1193,7 +1199,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* min/max the cpu is capable of */ if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { - printk(KERN_ERR PFX "invalid powernow_table\n"); + printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); powernow_k8_cpu_exit_acpi(data); kfree(data->powernow_table); kfree(data); -- cgit v1.2.3-70-g09d2 From 4c168eaf7ea39f25a45a3d8c7eebc3fedb633a1d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 24 Sep 2008 11:08:52 +0200 Subject: Revert "Oprofile Multiplexing Patch" Reverting commit 1a960b402a51d80abf54e3f8e4972374ffe5f22d for the main branch. Multiplexing will be tracked on a separate feature branch. Conflicts: arch/x86/oprofile/nmi_int.c --- arch/x86/oprofile/nmi_int.c | 100 +++----------------------------------- arch/x86/oprofile/op_counter.h | 3 +- arch/x86/oprofile/op_model_amd.c | 76 ++++++++++++----------------- arch/x86/oprofile/op_model_p4.c | 4 -- arch/x86/oprofile/op_model_ppro.c | 2 - arch/x86/oprofile/op_x86_model.h | 3 -- drivers/oprofile/oprof.c | 58 ++-------------------- drivers/oprofile/oprof.h | 4 +- drivers/oprofile/oprofile_files.c | 39 +-------------- include/linux/oprofile.h | 3 -- 10 files changed, 45 insertions(+), 247 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 4108d02c529..287513a0981 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -23,18 +23,12 @@ #include "op_counter.h" #include "op_x86_model.h" -DEFINE_PER_CPU(int, switch_index); - static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); static int nmi_start(void); static void nmi_stop(void); -static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs); -static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs); -static void nmi_cpu_stop(void *dummy); -static void nmi_cpu_start(void *dummy); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; @@ -87,47 +81,6 @@ static void exit_sysfs(void) #define exit_sysfs() do { } while (0) #endif /* CONFIG_PM */ -static void nmi_cpu_switch(void *dummy) -{ - int cpu = smp_processor_id(); - int si = per_cpu(switch_index, cpu); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - nmi_cpu_stop(NULL); - nmi_cpu_save_mpx_registers(msrs); - - /* move to next set */ - si += model->num_hardware_counters; - if ((si > model->num_counters) || (counter_config[si].count == 0)) - per_cpu(switch_index, smp_processor_id()) = 0; - else - per_cpu(switch_index, smp_processor_id()) = si; - - nmi_cpu_restore_mpx_registers(msrs); - model->setup_ctrs(msrs); - nmi_cpu_start(NULL); -} - -/* - * Quick check to see if multiplexing is necessary. - * The check should be sufficient since counters are used - * in ordre. - */ -static int nmi_multiplex_on(void) -{ - return counter_config[model->num_hardware_counters].count ? 0 : -EINVAL; -} - -static int nmi_switch_event(void) -{ - if (nmi_multiplex_on() < 0) - return -EINVAL; - - on_each_cpu(nmi_cpu_switch, NULL, 1); - - return 0; -} - static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -191,10 +144,11 @@ static void free_msrs(void) static int allocate_msrs(void) { - int i, success = 1; + int success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; + int i; for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, GFP_KERNEL); @@ -202,8 +156,8 @@ static int allocate_msrs(void) success = 0; break; } - per_cpu(cpu_msrs, i).controls = - kmalloc(controls_size, GFP_KERNEL); + per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, + GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) { success = 0; break; @@ -247,8 +201,7 @@ static int nmi_setup(void) return err; } - /* - * We need to serialize save and setup for HT because the subset + /* We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations */ @@ -264,6 +217,7 @@ static int nmi_setup(void) per_cpu(cpu_msrs, 0).controls, sizeof(struct op_msr) * model->num_controls); } + } on_each_cpu(nmi_save_registers, NULL, 1); on_each_cpu(nmi_cpu_setup, NULL, 1); @@ -271,41 +225,7 @@ static int nmi_setup(void) return 0; } -static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) -{ - unsigned int si = __get_cpu_var(switch_index); - unsigned int const nr_ctrs = model->num_hardware_counters; - struct op_msr *counters = &msrs->counters[si]; - unsigned int i; - - for (i = 0; i < nr_ctrs; ++i) { - int offset = i + si; - if (counters[offset].addr) { - rdmsr(counters[offset].addr, - counters[offset].multiplex.low, - counters[offset].multiplex.high); - } - } -} - -static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) -{ - unsigned int si = __get_cpu_var(switch_index); - unsigned int const nr_ctrs = model->num_hardware_counters; - struct op_msr *counters = &msrs->counters[si]; - unsigned int i; - - for (i = 0; i < nr_ctrs; ++i) { - int offset = i + si; - if (counters[offset].addr) { - wrmsr(counters[offset].addr, - counters[offset].multiplex.low, - counters[offset].multiplex.high); - } - } -} - -static void nmi_cpu_restore_registers(struct op_msrs *msrs) +static void nmi_restore_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; unsigned int const nr_ctrls = model->num_controls; @@ -345,8 +265,7 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); - nmi_cpu_restore_registers(msrs); - __get_cpu_var(switch_index) = 0; + nmi_restore_registers(msrs); } static void nmi_shutdown(void) @@ -409,7 +328,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); - counter_config[i].save_count_low = 0; } return 0; @@ -551,14 +469,12 @@ int __init op_nmi_init(struct oprofile_operations *ops) } /* default values, can be overwritten by model */ - __raw_get_cpu_var(switch_index) = 0; ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; - ops->switch_events = nmi_switch_event; if (model->init) ret = model->init(ops); diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 786d6e01cf7..2880b15c467 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -10,14 +10,13 @@ #ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 32 +#define OP_MAX_COUNTER 8 /* Per-perfctr configuration as set via * oprofilefs. */ struct op_counter_config { unsigned long count; - unsigned long save_count_low; unsigned long enabled; unsigned long event; unsigned long kernel; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index bbf2b68bcc5..d9faf607b3a 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -24,10 +23,8 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 32 -#define NUM_HARDWARE_COUNTERS 4 -#define NUM_CONTROLS 32 -#define NUM_HARDWARE_CONTROLS 4 +#define NUM_COUNTERS 4 +#define NUM_CONTROLS 4 #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) @@ -51,7 +48,6 @@ #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) static unsigned long reset_value[NUM_COUNTERS]; -DECLARE_PER_CPU(int, switch_index); #ifdef CONFIG_OPROFILE_IBS @@ -134,17 +130,15 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) int i; for (i = 0; i < NUM_COUNTERS; i++) { - int hw_counter = i % NUM_HARDWARE_COUNTERS; - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + hw_counter)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + hw_counter; + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; else msrs->counters[i].addr = 0; } for (i = 0; i < NUM_CONTROLS; i++) { - int hw_control = i % NUM_HARDWARE_CONTROLS; - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + hw_control)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + hw_control; + if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; else msrs->controls[i].addr = 0; } @@ -156,16 +150,8 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) unsigned int low, high; int i; - for (i = 0; i < NUM_HARDWARE_CONTROLS; ++i) { - int offset = i + __get_cpu_var(switch_index); - if (counter_config[offset].enabled) - reset_value[offset] = counter_config[offset].count; - else - reset_value[offset] = 0; - } - /* clear all counters */ - for (i = 0 ; i < NUM_HARDWARE_CONTROLS; ++i) { + for (i = 0 ; i < NUM_CONTROLS; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; CTRL_READ(low, high, msrs, i); @@ -175,31 +161,34 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) } /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs, i))) continue; CTR_WRITE(1, msrs, i); } /* enable active counters */ - for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { - int offset = i + __get_cpu_var(switch_index); - if ((counter_config[offset].enabled) && (CTR_IS_RESERVED(msrs, i))) { - CTR_WRITE(counter_config[offset].count, msrs, i); + for (i = 0; i < NUM_COUNTERS; ++i) { + if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + reset_value[i] = counter_config[i].count; + + CTR_WRITE(counter_config[i].count, msrs, i); CTRL_READ(low, high, msrs, i); CTRL_CLEAR_LO(low); CTRL_CLEAR_HI(high); CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[offset].user); - CTRL_SET_KERN(low, counter_config[offset].kernel); - CTRL_SET_UM(low, counter_config[offset].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[offset].event); - CTRL_SET_EVENT_HIGH(high, counter_config[offset].event); + CTRL_SET_USR(low, counter_config[i].user); + CTRL_SET_KERN(low, counter_config[i].kernel); + CTRL_SET_UM(low, counter_config[i].unit_mask); + CTRL_SET_EVENT_LOW(low, counter_config[i].event); + CTRL_SET_EVENT_HIGH(high, counter_config[i].event); CTRL_SET_HOST_ONLY(high, 0); CTRL_SET_GUEST_ONLY(high, 0); CTRL_WRITE(low, high, msrs, i); + } else { + reset_value[i] = 0; } } } @@ -287,14 +276,13 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, unsigned int low, high; int i; - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { - int offset = i + __get_cpu_var(switch_index); - if (!reset_value[offset]) + for (i = 0 ; i < NUM_COUNTERS; ++i) { + if (!reset_value[i]) continue; CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, offset); - CTR_WRITE(reset_value[offset], msrs, i); + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], msrs, i); } } @@ -310,10 +298,8 @@ static void op_amd_start(struct op_msrs const * const msrs) { unsigned int low, high; int i; - - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { - int offset = i + __get_cpu_var(switch_index); - if (reset_value[offset]) { + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (reset_value[i]) { CTRL_READ(low, high, msrs, i); CTRL_SET_ACTIVE(low); CTRL_WRITE(low, high, msrs, i); @@ -343,8 +329,8 @@ static void op_amd_stop(struct op_msrs const * const msrs) /* Subtle: stop on all counters to avoid race with * setting our pm callback */ - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { - if (!reset_value[i + per_cpu(switch_index, smp_processor_id())]) + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (!reset_value[i]) continue; CTRL_READ(low, high, msrs, i); CTRL_SET_INACTIVE(low); @@ -370,11 +356,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (CTR_IS_RESERVED(msrs, i)) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + for (i = 0 ; i < NUM_CONTROLS ; ++i) { if (CTRL_IS_RESERVED(msrs, i)) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } @@ -548,8 +534,6 @@ struct op_x86_model_spec const op_amd_spec = { .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, - .num_hardware_counters = NUM_HARDWARE_COUNTERS, - .num_hardware_controls = NUM_HARDWARE_CONTROLS, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, .check_ctrs = &op_amd_check_ctrs, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index cacba61ffba..43ac5af338d 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -700,8 +700,6 @@ static void p4_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, - .num_hardware_counters = NUM_COUNTERS_HT2, - .num_hardware_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -714,8 +712,6 @@ struct op_x86_model_spec const op_p4_ht2_spec = { struct op_x86_model_spec const op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, - .num_hardware_counters = NUM_COUNTERS_NON_HT, - .num_hardware_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index e5811aa480e..eff431f6c57 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -183,8 +183,6 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, - .num_hardware_counters = NUM_COUNTERS, - .num_hardware_controls = NUM_CONTROLS, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index e07ba107637..05a0261ba0c 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -19,7 +19,6 @@ struct op_saved_msr { struct op_msr { unsigned long addr; struct op_saved_msr saved; - struct op_saved_msr multiplex; }; struct op_msrs { @@ -35,8 +34,6 @@ struct pt_regs; struct op_x86_model_spec { int (*init)(struct oprofile_operations *ops); void (*exit)(void); - unsigned int const num_hardware_counters; - unsigned int const num_hardware_controls; unsigned int const num_counters; unsigned int const num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index b2fa5df64a6..2c645170f06 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -12,8 +12,6 @@ #include #include #include -#include -#include #include #include "oprof.h" @@ -21,18 +19,13 @@ #include "cpu_buffer.h" #include "buffer_sync.h" #include "oprofile_stats.h" - -static unsigned long is_setup; -static void switch_worker(struct work_struct *work); -static DECLARE_DELAYED_WORK(switch_work, switch_worker); -static DEFINE_MUTEX(start_mutex); struct oprofile_operations oprofile_ops; -unsigned long timeout_jiffies; unsigned long oprofile_started; unsigned long backtrace_depth; -/* Multiplexing defaults at 1 msec*/ +static unsigned long is_setup; +static DEFINE_MUTEX(start_mutex); /* timer 0 - use performance monitoring hardware if available @@ -94,16 +87,6 @@ out: return err; } -static void start_switch_worker(void) -{ - schedule_delayed_work(&switch_work, timeout_jiffies); -} - -static void switch_worker(struct work_struct *work) -{ - if (!oprofile_ops.switch_events()) - start_switch_worker(); -} /* Actually start profiling (echo 1>/dev/oprofile/enable) */ int oprofile_start(void) @@ -111,6 +94,7 @@ int oprofile_start(void) int err = -EINVAL; mutex_lock(&start_mutex); + if (!is_setup) goto out; @@ -124,9 +108,6 @@ int oprofile_start(void) if ((err = oprofile_ops.start())) goto out; - if (oprofile_ops.switch_events) - start_switch_worker(); - oprofile_started = 1; out: mutex_unlock(&start_mutex); @@ -142,7 +123,6 @@ void oprofile_stop(void) goto out; oprofile_ops.stop(); oprofile_started = 0; - cancel_delayed_work_sync(&switch_work); /* wake up the daemon to read what remains */ wake_up_buffer_waiter(); out: @@ -175,32 +155,6 @@ post_sync: mutex_unlock(&start_mutex); } -/* User inputs in ms, converts to jiffies */ -int oprofile_set_timeout(unsigned long val_msec) -{ - int err = 0; - - mutex_lock(&start_mutex); - - if (oprofile_started) { - err = -EBUSY; - goto out; - } - - if (!oprofile_ops.switch_events) { - err = -EINVAL; - goto out; - } - - timeout_jiffies = msecs_to_jiffies(val_msec); - if (timeout_jiffies == MAX_JIFFY_OFFSET) - timeout_jiffies = msecs_to_jiffies(1); - -out: - mutex_unlock(&start_mutex); - return err; - -} int oprofile_set_backtrace(unsigned long val) { @@ -225,16 +179,10 @@ out: return err; } -static void __init oprofile_switch_timer_init(void) -{ - timeout_jiffies = msecs_to_jiffies(1); -} - static int __init oprofile_init(void) { int err; - oprofile_switch_timer_init(); err = oprofile_arch_init(&oprofile_ops); if (err < 0 || timer) { diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index c4406a7366b..18323650806 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -27,8 +27,7 @@ extern unsigned long fs_buffer_watershed; extern struct oprofile_operations oprofile_ops; extern unsigned long oprofile_started; extern unsigned long backtrace_depth; -extern unsigned long timeout_jiffies; - + struct super_block; struct dentry; @@ -36,6 +35,5 @@ void oprofile_create_files(struct super_block * sb, struct dentry * root); void oprofile_timer_init(struct oprofile_operations * ops); int oprofile_set_backtrace(unsigned long depth); -int oprofile_set_timeout(unsigned long time); #endif /* OPROF_H */ diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index cc4f5a1f8ef..ef953ba5ab6 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -9,7 +9,6 @@ #include #include -#include #include "event_buffer.h" #include "oprofile_stats.h" @@ -19,40 +18,6 @@ unsigned long fs_buffer_size = 131072; unsigned long fs_cpu_buffer_size = 8192; unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */ -static ssize_t timeout_read(struct file *file, char __user *buf, - size_t count, loff_t *offset) -{ - return oprofilefs_ulong_to_user(jiffies_to_msecs(timeout_jiffies), - buf, count, offset); -} - - -static ssize_t timeout_write(struct file *file, char const __user *buf, - size_t count, loff_t *offset) -{ - unsigned long val; - int retval; - - if (*offset) - return -EINVAL; - - retval = oprofilefs_ulong_from_user(&val, buf, count); - if (retval) - return retval; - - retval = oprofile_set_timeout(val); - - if (retval) - return retval; - return count; -} - -static const struct file_operations timeout_fops = { - .read = timeout_read, - .write = timeout_write, -}; - - static ssize_t depth_read(struct file * file, char __user * buf, size_t count, loff_t * offset) { return oprofilefs_ulong_to_user(backtrace_depth, buf, count, offset); @@ -120,10 +85,11 @@ static ssize_t enable_write(struct file * file, char const __user * buf, size_t if (*offset) return -EINVAL; + retval = oprofilefs_ulong_from_user(&val, buf, count); if (retval) return retval; - + if (val) retval = oprofile_start(); else @@ -163,7 +129,6 @@ void oprofile_create_files(struct super_block * sb, struct dentry * root) oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops); - oprofilefs_create_file(sb, root, "timeout_ms", &timeout_fops); oprofile_create_stats_files(sb, root); if (oprofile_ops.create_files) oprofile_ops.create_files(sb, root); diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 687f2f4c36a..bcb8f725427 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -67,9 +67,6 @@ struct oprofile_operations { /* Initiate a stack backtrace. Optional. */ void (*backtrace)(struct pt_regs * const regs, unsigned int depth); - - /* Multiplex between different events. Optional. */ - int (*switch_events)(void); /* CPU identification string. */ char * cpu_type; }; -- cgit v1.2.3-70-g09d2 From 379daf6290814e41f14880094b7b773640df2461 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 25 Sep 2008 18:43:34 -0700 Subject: IO resources, x86: ioremap sanity check to catch mapping requests exceeding the BAR sizes Go through the iomem resource tree to check if any of the ioremap() requests span more than any slot in the iomem resource tree and do a WARN_ON() if we hit this check. This will raise a red-flag, if some driver is mapping more than what is needed. And hopefully identify possible corruptions much earlier. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 6 ++++++ include/linux/ioport.h | 1 + kernel/resource.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4b6e6a29ae..c818b45bd07 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -149,6 +149,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, if (is_ISA_range(phys_addr, last_addr)) return (__force void __iomem *)phys_to_virt(phys_addr); + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ON(iomem_map_sanity_check(phys_addr, size)); + /* * Don't allow anybody to remap normal RAM that we're using.. */ diff --git a/include/linux/ioport.h b/include/linux/ioport.h index fded376b94e..01712cf1a38 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -169,6 +169,7 @@ extern struct resource * __devm_request_region(struct device *dev, extern void __devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n); +extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size); #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/kernel/resource.c b/kernel/resource.c index fc59dcc4795..1d003a50ee1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -827,3 +827,36 @@ static int __init reserve_setup(char *str) } __setup("reserve=", reserve_setup); + +/* + * Check if the requested addr and size spans more than any slot in the + * iomem resource tree. + */ +int iomem_map_sanity_check(resource_size_t addr, unsigned long size) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + continue; + if (p->end < addr) + continue; + if (p->start <= addr && (p->end >= addr + size - 1)) + continue; + printk(KERN_WARNING "resource map sanity check conflict: " + "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + addr, addr + size - 1, p->start, p->end, p->name); + err = -1; + break; + } + read_unlock(&resource_lock); + + return err; +} -- cgit v1.2.3-70-g09d2 From d0d0f7432c9cbd52cb2f31d499f8292b13a7ecac Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Thu, 9 Oct 2008 12:41:50 -0500 Subject: x86: remove magic number from ACPI sleep stack buffer x86_64 SMP suspend to RAM uses a 10k temporary stack for saving the kernel state, but only 4k of it is used. Shrink it to 4k. Signed-off-by: Matt Mackall Acked-by: Pavel Machek Signed-off-by: Len Brown --- arch/x86/kernel/acpi/sleep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 426e5d91b63..29cf3403abe 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -97,7 +97,7 @@ int acpi_save_state_mem(void) #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; #ifdef CONFIG_SMP - stack_start.sp = temp_stack + 4096; + stack_start.sp = temp_stack + sizeof(temp_stack); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; -- cgit v1.2.3-70-g09d2 From 5000cadcf3188e935dae28c4fc7e24639704ea55 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Thu, 9 Oct 2008 11:56:21 -0500 Subject: x86: trim ACPI sleep stack buffer x86_64 SMP suspend to RAM uses a 10k temporary stack for saving the kernel state, but only 4k of it is used. Shrink it to 4k. Signed-off-by: Matt Mackall Signed-off-by: Len Brown --- arch/x86/kernel/acpi/sleep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 29cf3403abe..55d10cbe65b 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -21,7 +21,7 @@ unsigned long acpi_realmode_flags; static unsigned long acpi_realmode; #if defined(CONFIG_SMP) && defined(CONFIG_64BIT) -static char temp_stack[10240]; +static char temp_stack[4096]; #endif /** -- cgit v1.2.3-70-g09d2 From ee297533279a802eac8b1cbea8e65b24b36a1aac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Sep 2008 19:04:31 -0700 Subject: ACPI: don't load acpi_cpufreq if acpi=off Signed-off-by: Yinghai Lu Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index dd097b83583..9943b4c8774 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -779,6 +779,9 @@ static int __init acpi_cpufreq_init(void) { int ret; + if (acpi_disabled) + return 0; + dprintk("acpi_cpufreq_init\n"); ret = acpi_cpufreq_early_init(); -- cgit v1.2.3-70-g09d2 From 891cffbd6bcba26409869c19c07ecd4bfc0c2460 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Oct 2008 13:16:12 -0700 Subject: x86/mm: do not trigger a kernel warning if user-space disables interrupts and generates a page fault Arjan reported a spike in the following bug pattern in v2.6.27: http://www.kerneloops.org/searchweek.php?search=lock_page which happens because hwclock started triggering warnings due to a (correct) might_sleep() check in the MM code. The warning occurs because hwclock uses this dubious sequence of code to run "atomic" code: static unsigned long atomic(const char *name, unsigned long (*op)(unsigned long), unsigned long arg) { unsigned long v; __asm__ volatile ("cli"); v = (*op)(arg); __asm__ volatile ("sti"); return v; } Then it pagefaults in that "atomic" section, triggering the warning. There is no way the kernel could provide "atomicity" in this path, a page fault is a cannot-continue machine event so the kernel has to wait for the page to be filled in. Even if it was just a minor fault we'd have to take locks and might have to spend quite a bit of time with interrupts disabled - not nice to irq latencies in general. So instead just enable interrupts in the pagefault path unconditionally if we come from user-space, and handle the fault. Also, while touching this code, unify some trivial parts of the x86 VM paths at the same time. Signed-off-by: Linus Torvalds Reported-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a742d753d5b..ac2ad781da0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -645,24 +645,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) } -#ifdef CONFIG_X86_32 - /* It's safe to allow irq's after cr2 has been saved and the vmalloc - fault has been handled. */ - if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) - local_irq_enable(); - /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet. */ - if (in_atomic() || !mm) - goto bad_area_nosemaphore; -#else /* CONFIG_X86_64 */ - if (likely(regs->flags & X86_EFLAGS_IF)) + if (user_mode_vm(regs)) { + local_irq_enable(); + error_code |= PF_USER; + } else if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); +#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(address, regs, error_code); +#endif /* * If we're in an interrupt, have no user context or are running in an @@ -671,14 +670,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(in_atomic() || !mm)) goto bad_area_nosemaphore; - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; again: -#endif /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an -- cgit v1.2.3-70-g09d2 From 3a1dfe6eefe483589c99c909202ffe1a20d589b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 13 Oct 2008 17:49:02 +0200 Subject: x86/mm: unify init task OOM handling Linus noticed that the "again:" versus "survive:" OOM logic for the init task was arbitrarily different. The 64-bit codepath is the better one, because it correctly re-lookups the vma after having dropped the ->mmap_sem. Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds --- arch/x86/mm/fault.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ac2ad781da0..8bc5956e1af 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -671,7 +671,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) goto bad_area_nosemaphore; again: - /* When running in the kernel we expect faults to occur only to + /* + * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an * erroneous fault occurring in a code path which already holds mmap_sem @@ -734,9 +735,6 @@ good_area: goto bad_area; } -#ifdef CONFIG_X86_32 -survive: -#endif /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -871,12 +869,11 @@ out_of_memory: up_read(&mm->mmap_sem); if (is_global_init(tsk)) { yield(); -#ifdef CONFIG_X86_32 - down_read(&mm->mmap_sem); - goto survive; -#else + /* + * Re-lookup the vma - in theory the vma tree might + * have changed: + */ goto again; -#endif } printk("VM: killing process %s\n", tsk->comm); -- cgit v1.2.3-70-g09d2 From 5d4488027d9cf3161c71566dfabb116bf69ab4d9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 18 Aug 2008 14:49:47 +0200 Subject: oprofile: drop const in num counters field allow to modify it at runtime Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/op_x86_model.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 05a0261ba0c..3d3b85d3c25 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -34,8 +34,8 @@ struct pt_regs; struct op_x86_model_spec { int (*init)(struct oprofile_operations *ops); void (*exit)(void); - unsigned int const num_counters; - unsigned int const num_controls; + unsigned int num_counters; + unsigned int num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_msrs const * const msrs); int (*check_ctrs)(struct pt_regs * const regs, -- cgit v1.2.3-70-g09d2 From f645f6406463a01869c50844befc76d528971690 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 20 Aug 2008 17:22:02 +0200 Subject: oprofile: Don't report Nehalem as core_2 This essentially reverts Linus' earlier 4b9f12a3779c548b68bc9af7d94030868ad3aa1b commit. Nehalem is not core_2, so it shouldn't be reported as such. However with the earlier arch perfmon patch it will fall back to arch perfmon mode now, so there is no need to fake it as core_2. The only drawback is that Linus will need to patch the arch perfmon support into his oprofile binary now, but I think he can do that. Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 57f6c908808..1059f3fe6b1 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -420,9 +420,6 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; - case 26: - *cpu_type = "i386/core_2"; - break; default: /* Unknown */ return 0; -- cgit v1.2.3-70-g09d2 From b99170288421c79f0c2efa8b33e26e65f4bb7fb8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 18 Aug 2008 14:50:31 +0200 Subject: oprofile: Implement Intel architectural perfmon support Newer Intel CPUs (Core1+) have support for architectural events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. The advantage of this is that it can be done without knowing about the specific CPU, because the CPU describes by itself what performance events are supported. This is only a fallback because only a limited set of 6 events are supported. This allows to do profiling on Nehalem and on Atom systems (later not tested) This patch implements support for that in oprofile's Intel Family 6 profiling module. It also has the advantage of supporting an arbitary number of events now as reported by the CPU. Also allow arbitary counter widths >32bit while we're at it. Requires a patched oprofile userland to support the new architecture. v2: update for latest oprofile tree remove force_arch_perfmon Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 23 ++++++--- arch/x86/oprofile/op_model_ppro.c | 104 ++++++++++++++++++++++++++++++-------- arch/x86/oprofile/op_x86_model.h | 3 ++ 3 files changed, 102 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 1059f3fe6b1..12d6f85084f 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -429,6 +429,16 @@ static int __init ppro_init(char **cpu_type) return 1; } +static int __init arch_perfmon_init(char **cpu_type) +{ + if (!cpu_has_arch_perfmon) + return 0; + *cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; + arch_perfmon_setup_counters(); + return 1; +} + /* in order to get sysfs right */ static int using_nmi; @@ -436,7 +446,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) { __u8 vendor = boot_cpu_data.x86_vendor; __u8 family = boot_cpu_data.x86; - char *cpu_type; + char *cpu_type = NULL; int ret = 0; if (!cpu_has_apic) @@ -474,19 +484,20 @@ int __init op_nmi_init(struct oprofile_operations *ops) switch (family) { /* Pentium IV */ case 0xf: - if (!p4_init(&cpu_type)) - return -ENODEV; + p4_init(&cpu_type); break; /* A P6-class processor */ case 6: - if (!ppro_init(&cpu_type)) - return -ENODEV; + ppro_init(&cpu_type); break; default: - return -ENODEV; + break; } + + if (!cpu_type && !arch_perfmon_init(&cpu_type)) + return -ENODEV; break; default: diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index eff431f6c57..12e207a67f1 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -1,32 +1,34 @@ /* * @file op_model_ppro.h - * pentium pro / P6 model-specific MSR operations + * Family 6 perfmon and architectural perfmon MSR operations * * @remark Copyright 2002 OProfile authors + * @remark Copyright 2008 Intel Corporation * @remark Read the file COPYING * * @author John Levon * @author Philippe Elie * @author Graydon Hoare + * @author Andi Kleen */ #include +#include #include #include #include #include +#include #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 2 -#define NUM_CONTROLS 2 +static int num_counters = 2; +static int counter_width = 32; #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_32BIT_WRITE(l, msrs, c) \ - do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<(counter_width-1)))) #define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) #define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) @@ -40,20 +42,20 @@ #define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT(val, e) (val |= e) -static unsigned long reset_value[NUM_COUNTERS]; +static u64 *reset_value; static void ppro_fill_in_addresses(struct op_msrs * const msrs) { int i; - for (i = 0; i < NUM_COUNTERS; i++) { + for (i = 0; i < num_counters; i++) { if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; else msrs->counters[i].addr = 0; } - for (i = 0; i < NUM_CONTROLS; i++) { + for (i = 0; i < num_counters; i++) { if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; else @@ -67,8 +69,22 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) unsigned int low, high; int i; + if (!reset_value) { + reset_value = kmalloc(sizeof(unsigned) * num_counters, + GFP_ATOMIC); + if (!reset_value) + return; + } + + if (cpu_has_arch_perfmon) { + union cpuid10_eax eax; + eax.full = cpuid_eax(0xa); + if (counter_width < eax.split.bit_width) + counter_width = eax.split.bit_width; + } + /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { + for (i = 0 ; i < num_counters; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; CTRL_READ(low, high, msrs, i); @@ -77,18 +93,18 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) } /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs, i))) continue; - CTR_32BIT_WRITE(1, msrs, i); + wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; - CTR_32BIT_WRITE(counter_config[i].count, msrs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); CTRL_READ(low, high, msrs, i); CTRL_CLEAR(low); @@ -111,13 +127,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs, unsigned int low, high; int i; - for (i = 0 ; i < NUM_COUNTERS; ++i) { + for (i = 0 ; i < num_counters; ++i) { if (!reset_value[i]) continue; CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { oprofile_add_sample(regs, i); - CTR_32BIT_WRITE(reset_value[i], msrs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); } } @@ -141,7 +157,7 @@ static void ppro_start(struct op_msrs const * const msrs) unsigned int low, high; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { CTRL_READ(low, high, msrs, i); CTRL_SET_ACTIVE(low); @@ -156,7 +172,7 @@ static void ppro_stop(struct op_msrs const * const msrs) unsigned int low, high; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; CTRL_READ(low, high, msrs, i); @@ -169,21 +185,65 @@ static void ppro_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0 ; i < num_counters ; ++i) { if (CTR_IS_RESERVED(msrs, i)) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { + for (i = 0 ; i < num_counters ; ++i) { if (CTRL_IS_RESERVED(msrs, i)) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } + if (reset_value) { + kfree(reset_value); + reset_value = NULL; + } } struct op_x86_model_spec const op_ppro_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, + .num_counters = 2, + .num_controls = 2, + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown +}; + +/* + * Architectural performance monitoring. + * + * Newer Intel CPUs (Core1+) have support for architectural + * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. + * The advantage of this is that it can be done without knowing about + * the specific CPU. + */ + +void arch_perfmon_setup_counters(void) +{ + union cpuid10_eax eax; + + eax.full = cpuid_eax(0xa); + + /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ + if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && + current_cpu_data.x86_model == 15) { + eax.split.version_id = 2; + eax.split.num_counters = 2; + eax.split.bit_width = 40; + } + + num_counters = eax.split.num_counters; + + op_arch_perfmon_spec.num_counters = num_counters; + op_arch_perfmon_spec.num_controls = num_counters; +} + +struct op_x86_model_spec op_arch_perfmon_spec = { + /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, + /* user space does the cpuid check for available events */ .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, .start = &ppro_start, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 3d3b85d3c25..0b601893a4d 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -49,5 +49,8 @@ extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_amd_spec; +extern struct op_x86_model_spec op_arch_perfmon_spec; + +extern void arch_perfmon_setup_counters(void); #endif /* OP_X86_MODEL_H */ -- cgit v1.2.3-70-g09d2 From 59512900baab03c5629f2ff5efad1d5d4e682ece Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 29 Sep 2008 22:23:33 +0200 Subject: oprofile: discover counters for op ppro too Discover number of counters for all family 6 models even when not in arch perfmon mode. Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 8 +++++--- arch/x86/oprofile/op_x86_model.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 12e207a67f1..f5a226823e9 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -200,9 +200,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs) } -struct op_x86_model_spec const op_ppro_spec = { - .num_counters = 2, - .num_controls = 2, +struct op_x86_model_spec op_ppro_spec = { + .num_counters = 2, /* can be overriden */ + .num_controls = 2, /* dito */ .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, @@ -238,6 +238,8 @@ void arch_perfmon_setup_counters(void) op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; + op_ppro_spec.num_counters = num_counters; + op_ppro_spec.num_controls = num_counters; } struct op_x86_model_spec op_arch_perfmon_spec = { diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 0b601893a4d..596de7a5559 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -45,7 +45,7 @@ struct op_x86_model_spec { void (*shutdown)(struct op_msrs const * const msrs); }; -extern struct op_x86_model_spec const op_ppro_spec; +extern struct op_x86_model_spec op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_amd_spec; -- cgit v1.2.3-70-g09d2 From 611b1597680dd4a57896bcca1af0484be463c55e Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 21 Jul 2008 18:49:56 +0300 Subject: x86: fix mmiotrace 8-bit register decoding When SIL, DIL, BPL or SPL registers were used in MMIO, the datum was extracted from AH, BH, CH, or DH, which are incorrect. Signed-off-by: Pekka Paalanen Cc: "Vegard Nossum" Cc: "Steven Rostedt" Cc: proski@gnu.org Cc: "Pekka Enberg" Signed-off-by: Ingo Molnar --- arch/x86/mm/pf_in.c | 121 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index efa1911e20c..df3d5c861cd 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 }; static unsigned int mw64[] = { 0x89, 0x8B }; #endif /* not __i386__ */ -static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, - int *rexr) +struct prefix_bits { + unsigned shorted:1; + unsigned enlarged:1; + unsigned rexr:1; + unsigned rex:1; +}; + +static int skip_prefix(unsigned char *addr, struct prefix_bits *prf) { int i; unsigned char *p = addr; - *shorted = 0; - *enlarged = 0; - *rexr = 0; + prf->shorted = 0; + prf->enlarged = 0; + prf->rexr = 0; + prf->rex = 0; restart: for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { if (*p == prefix_codes[i]) { if (*p == 0x66) - *shorted = 1; + prf->shorted = 1; #ifdef __amd64__ if ((*p & 0xf8) == 0x48) - *enlarged = 1; + prf->enlarged = 1; if ((*p & 0xf4) == 0x44) - *rexr = 1; + prf->rexr = 1; + if ((*p & 0xf0) == 0x40) + prf->rex = 1; #endif p++; goto restart; @@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int shorted, enlarged, rexr; + struct prefix_bits prf; int i; enum reason_type rv = OTHERS; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); CHECK_OP_TYPE(opcode, reg_rop, REG_READ); @@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(rw8); i++) @@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) for (i = 0; i < ARRAY_SIZE(rw32); i++) if (rw32[i] == opcode) - return (shorted ? 2 : (enlarged ? 8 : 4)); + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); return 0; @@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(mw8); i++) @@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) for (i = 0; i < ARRAY_SIZE(mw32); i++) if (mw32[i] == opcode) - return shorted ? 2 : 4; + return prf.shorted ? 2 : 4; for (i = 0; i < ARRAY_SIZE(mw64); i++) if (mw64[i] == opcode) - return shorted ? 2 : (enlarged ? 8 : 4); + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); return 0; @@ -238,7 +249,7 @@ enum { #endif }; -static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs) { unsigned char *rv = NULL; @@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) case arg_DL: rv = (unsigned char *)®s->dx; break; - case arg_AH: - rv = 1 + (unsigned char *)®s->ax; - break; - case arg_BH: - rv = 1 + (unsigned char *)®s->bx; - break; - case arg_CH: - rv = 1 + (unsigned char *)®s->cx; - break; - case arg_DH: - rv = 1 + (unsigned char *)®s->dx; - break; #ifdef __amd64__ case arg_R8: rv = (unsigned char *)®s->r8; @@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) break; #endif default: - printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); break; } + + if (rv) + return rv; + + if (rex) { + /* + * If REX prefix exists, access low bytes of SI etc. + * instead of AH etc. + */ + switch (no) { + case arg_SI: + rv = (unsigned char *)®s->si; + break; + case arg_DI: + rv = (unsigned char *)®s->di; + break; + case arg_BP: + rv = (unsigned char *)®s->bp; + break; + case arg_SP: + rv = (unsigned char *)®s->sp; + break; + default: + break; + } + } else { + switch (no) { + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; + default: + break; + } + } + + if (!rv) + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + return rv; } @@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) unsigned char mod_rm; int reg; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; unsigned long rv; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(reg_rop); i++) if (reg_rop[i] == opcode) { @@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) do_work: mod_rm = *p; - reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); switch (get_ins_reg_width(ins_addr)) { case 1: - return *get_reg_w8(reg, regs); + return *get_reg_w8(reg, prf.rex, regs); case 2: return *(unsigned short *)get_reg_w32(reg, regs); @@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr) unsigned char mod_rm; unsigned char mod; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; unsigned long rv; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(imm_wop); i++) if (imm_wop[i] == opcode) { -- cgit v1.2.3-70-g09d2 From 8b1fa1d7b22f386747c7b78b918d4c680c16066f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 12:36:02 +0200 Subject: ftrace: mark lapic_wd_event() notrace it can be called in the NMI path: [ 0.645999] calling ftrace_dynamic_init+0x0/0xd6 [ 0.647521] ------------[ cut here ]------------ [ 0.647521] WARNING: at kernel/trace/ftrace.c:348 ftrace_record_ip+0x4e/0x252() [ 0.647521] Modules linked in: [ 0.647521] Pid: 15, comm: kstop1 Not tainted 2.6.27-rc1-tip #22686 [ 0.647521] [ 0.647521] Call Trace: [ 0.647521] [] warn_on_slowpath+0x5d/0x84 [ 0.647521] [] ? lapic_wd_event+0xb/0x5c [ 0.647521] [] ftrace_record_ip+0x4e/0x252 [ 0.647521] [] mcount_call+0x5/0x31 [ 0.647521] [] ? lapic_wd_event+0x10/0x5c [ 0.647521] [] nmi_watchdog_tick+0x19d/0x1ad [ 0.647521] [] default_do_nmi+0x75/0x1e3 [ 0.647521] [] do_nmi+0x5d/0x94 [ 0.647521] [] nmi+0xa2/0xc2 [ 0.647521] [] ? check_bytes_and_report+0x11/0xcc [ 0.647521] <> [] ? mcount_call+0x5/0x31 [ 0.647521] [] check_object+0x61/0x1b0 [ 0.647521] [] __slab_free+0x169/0x2ae [ 0.647521] [] ? __cleanup_sighand+0x25/0x27 [ 0.647521] [] ? __cleanup_sighand+0x25/0x27 [ 0.647521] [] kmem_cache_free+0x85/0xb9 [ 0.647521] [] __cleanup_sighand+0x25/0x27 [ 0.647521] [] release_task+0x256/0x339 [ 0.647521] [] do_exit+0x764/0x7ef [ 0.647521] [] __xchg+0x0/0x38 [ 0.647521] [] ? stop_cpu+0x0/0xb2 [ 0.647521] [] ? stop_cpu+0x0/0xb2 [ 0.647521] [] kthread+0x4e/0x7b [ 0.647521] [] child_rip+0xa/0x11 [ 0.647521] [] ? restore_args+0x0/0x30 [ 0.647521] [] ? native_load_tls+0x14/0x2e [ 0.647521] [] ? kthread+0x0/0x7b [ 0.647521] [] ? child_rip+0x0/0x11 [ 0.647521] [ 0.647521] ---[ end trace 4eaa2a86a8e2da22 ]--- [ 0.672032] initcall ftrace_dynamic_init+0x0/0xd6 returned 0 after 19 msecs also mark it no-kprobes while at it. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 6bff382094f..9abd48b2267 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -17,6 +17,8 @@ #include #include #include +#include + #include #include @@ -336,7 +338,8 @@ static void single_msr_unreserve(void) release_perfctr_nmi(wd_ops->perfctr); } -static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes +single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { /* start the cycle over again */ write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); @@ -401,7 +404,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) return 1; } -static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { /* * P6 based Pentium M need to re-unmask @@ -605,7 +608,7 @@ static void p4_unreserve(void) release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); } -static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { unsigned dummy; /* @@ -784,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz) return hz; } -int lapic_wd_event(unsigned nmi_hz) +int __kprobes lapic_wd_event(unsigned nmi_hz) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 ctr; -- cgit v1.2.3-70-g09d2 From e4b2b8866121bd06d5f3d9de0f79a04339ffa252 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:11 -0400 Subject: ftrace: enable using mcount recording on x86 Enable the use of the __mcount_loc infrastructure on x86_64 and i386. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fc8351f374f..5a34c5427a0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -25,6 +25,7 @@ config X86 select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_KRETPROBES + select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) -- cgit v1.2.3-70-g09d2 From 0a37605c2261a06d8cafc62dee11374ad676c8c4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:12 -0400 Subject: ftrace: x86 mcount stub x86 now sets up the mcount locations through the build and no longer needs to record the ip when the function is executed. This patch changes the initial mcount to simply return. There's no need to do any other work. If the ftrace start up test fails, the original mcount will be what everything will use, so having this as fast as possible is a good thing. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 14 -------------- arch/x86/kernel/entry_64.S | 26 -------------------------- arch/x86/kernel/ftrace.c | 14 ++------------ 3 files changed, 2 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index b21fbfaffe3..4e4269c73bb 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1153,20 +1153,6 @@ ENDPROC(xen_failsafe_callback) #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - subl $MCOUNT_INSN_SIZE, %eax - -.globl mcount_call -mcount_call: - call ftrace_stub - - popl %edx - popl %ecx - popl %eax - ret END(mcount) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1db6ce4314e..09e7145484c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -64,32 +64,6 @@ #ifdef CONFIG_FTRACE #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - - movq 0x38(%rsp), %rdi - subq $MCOUNT_INSN_SIZE, %rdi - -.globl mcount_call -mcount_call: - call ftrace_stub - - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp - retq END(mcount) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ab115cd15fd..96aadbfedcc 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -112,18 +112,8 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func) notrace int ftrace_mcount_set(unsigned long *data) { - unsigned long ip = (long)(&mcount_call); - unsigned long *addr = data; - unsigned char old[MCOUNT_INSN_SIZE], *new; - - /* - * Replace the mcount stub with a pointer to the - * ip recorder function. - */ - memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); - new = ftrace_call_replace(ip, *addr); - *addr = ftrace_modify_code(ip, old, new); - + /* mcount is initialized as a nop */ + *data = 0; return 0; } -- cgit v1.2.3-70-g09d2 From 732f3ca7d4ba3c1be8d051d52302ef441ee7748b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 18:05:05 -0400 Subject: ftrace: use only 5 byte nops for x86 Mathieu Desnoyers revealed a bug in the original code. The nop that is used to relpace the mcount caller can be a two part nop. This runs the risk where a process can be preempted after executing the first nop, but before the second part of the nop. The ftrace code calls kstop_machine to keep multiple CPUs from executing code that is being modified, but it does not protect against a task preempting in the middle of a two part nop. If the above preemption happens and the tracer is enabled, after the kstop_machine runs, all those nops will be calls to the trace function. If the preempted process that was preempted between the two nops is executed again, it will execute half of the call to the trace function, and this might crash the system. This patch instead uses what both the latest Intel and AMD spec suggests. That is the P6_NOP5 sequence of "0x0f 0x1f 0x44 0x00 0x00". Note, some older CPUs and QEMU might fault on this nop, so this nop is executed with fault handling first. If it detects a fault, it will then use the code "0x66 0x66 0x66 0x66 0x90". If that faults, it will then default to a simple "jmp 1f; .byte 0x00 0x00 0x00; 1:". The jmp is not optimal but will do if the first two can not be executed. TODO: Examine the cpuid to determine the nop to use. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 68 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 96aadbfedcc..4151c91254e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -16,8 +16,8 @@ #include #include -#include #include +#include /* Long is fine, even if it is only 4 bytes ;-) */ @@ -119,13 +119,67 @@ notrace int ftrace_mcount_set(unsigned long *data) int __init ftrace_dyn_arch_init(void *data) { - const unsigned char *const *noptable = find_nop_table(); - - /* This is running in kstop_machine */ - - ftrace_mcount_set(data); + extern const unsigned char ftrace_test_p6nop[]; + extern const unsigned char ftrace_test_nop5[]; + extern const unsigned char ftrace_test_jmp[]; + int faulted = 0; - ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; + /* + * There is no good nop for all x86 archs. + * We will default to using the P6_NOP5, but first we + * will test to make sure that the nop will actually + * work on this CPU. If it faults, we will then + * go to a lesser efficient 5 byte nop. If that fails + * we then just use a jmp as our nop. This isn't the most + * efficient nop, but we can not use a multi part nop + * since we would then risk being preempted in the middle + * of that nop, and if we enabled tracing then, it might + * cause a system crash. + * + * TODO: check the cpuid to determine the best nop. + */ + asm volatile ( + "jmp ftrace_test_jmp\n" + /* This code needs to stay around */ + ".section .text, \"ax\"\n" + "ftrace_test_jmp:" + "jmp ftrace_test_p6nop\n" + ".byte 0x00,0x00,0x00\n" /* 2 byte jmp + 3 bytes */ + "ftrace_test_p6nop:" + P6_NOP5 + "jmp 1f\n" + "ftrace_test_nop5:" + ".byte 0x66,0x66,0x66,0x66,0x90\n" + "jmp 1f\n" + ".previous\n" + "1:" + ".section .fixup, \"ax\"\n" + "2: movl $1, %0\n" + " jmp ftrace_test_nop5\n" + "3: movl $2, %0\n" + " jmp 1b\n" + ".previous\n" + _ASM_EXTABLE(ftrace_test_p6nop, 2b) + _ASM_EXTABLE(ftrace_test_nop5, 3b) + : "=r"(faulted) : "0" (faulted)); + + switch (faulted) { + case 0: + pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); + ftrace_nop = (unsigned long *)ftrace_test_p6nop; + break; + case 1: + pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); + ftrace_nop = (unsigned long *)ftrace_test_nop5; + break; + case 2: + pr_info("ftrace: converting mcount calls to jmp 1f\n"); + ftrace_nop = (unsigned long *)ftrace_test_jmp; + break; + } + + /* The return code is retured via data */ + *(unsigned long *)data = 0; return 0; } -- cgit v1.2.3-70-g09d2 From 6f93fc076a464bfe24e8d4c5fea3f6ca5bdb264d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Aug 2008 12:55:07 -0400 Subject: ftrace: x86 use copy to and from user functions The modification of code is performed either by kstop_machine, before SMP starts, or on module code before the module is executed. There is no reason to do the modifications from assembly. The copy to and from user functions are sufficient and produces cleaner and easier to read code. Thanks to Benjamin Herrenschmidt for suggesting the idea. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4151c91254e..082d9964262 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -60,11 +61,7 @@ notrace int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { - unsigned replaced; - unsigned old = *(unsigned *)old_code; /* 4 bytes */ - unsigned new = *(unsigned *)new_code; /* 4 bytes */ - unsigned char newch = new_code[4]; - int faulted = 0; + unsigned char replaced[MCOUNT_INSN_SIZE]; /* * Note: Due to modules and __init, code can @@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, * as well as code changing. * * No real locking needed, this code is run through - * kstop_machine. + * kstop_machine, or before SMP starts. */ - asm volatile ( - "1: lock\n" - " cmpxchg %3, (%2)\n" - " jnz 2f\n" - " movb %b4, 4(%2)\n" - "2:\n" - ".section .fixup, \"ax\"\n" - "3: movl $1, %0\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : "=r"(faulted), "=a"(replaced) - : "r"(ip), "r"(new), "c"(newch), - "0"(faulted), "a"(old) - : "memory"); - sync_core(); + if (__copy_from_user(replaced, (char __user *)ip, MCOUNT_INSN_SIZE)) + return 1; - if (replaced != old && replaced != new) - faulted = 2; + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return 2; - return faulted; + WARN_ON_ONCE(__copy_to_user((char __user *)ip, new_code, + MCOUNT_INSN_SIZE)); + + sync_core(); + + return 0; } notrace int ftrace_update_ftrace_func(ftrace_func_t func) -- cgit v1.2.3-70-g09d2 From bbe5c7830c6dbde58726d44ec0337bc8b2d95d37 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 21:54:16 +0300 Subject: x86 mmiotrace: fix a rare memory leak Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 635b50e8558..754bd1eaf4f 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -307,8 +307,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, map.map_id = trace->id; spin_lock_irq(&trace_lock); - if (!is_enabled()) + if (!is_enabled()) { + kfree(trace); goto not_enabled; + } mmio_trace_mapping(&map); list_add_tail(&trace->list, &trace_list); -- cgit v1.2.3-70-g09d2 From 9e57fb35d711331a9b1410c5c56ebeb3733428a0 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 22:00:34 +0300 Subject: x86 mmiotrace: implement mmiotrace_printk() Offer mmiotrace users a function to inject markers from inside the kernel. This depends on the trace_vprintk() patch. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 19 ++++++++++++++++++- arch/x86/mm/testmmiotrace.c | 4 ++++ include/linux/mmiotrace.h | 17 +++++++++++++++-- kernel/trace/trace_mmiotrace.c | 5 +++++ 4 files changed, 42 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 754bd1eaf4f..5e2e2e72ee8 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -75,7 +75,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ * and trace_lock. * - Routines depending on is_enabled() must take trace_lock. * - trace_list users must hold trace_lock. - * - is_enabled() guarantees that mmio_trace_record is allowed. + * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed. * - pre/post callbacks assume the effect of is_enabled() being true. */ @@ -379,6 +379,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr) iounmap_trace_core(addr); } +int mmiotrace_printk(const char *fmt, ...) +{ + int ret = 0; + va_list args; + unsigned long flags; + va_start(args, fmt); + + spin_lock_irqsave(&trace_lock, flags); + if (is_enabled()) + ret = mmio_trace_printk(fmt, args); + spin_unlock_irqrestore(&trace_lock, flags); + + va_end(args); + return ret; +} +EXPORT_SYMBOL(mmiotrace_printk); + static void clear_trace_list(void) { struct remap_trace *trace; diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index d877c5b423e..ab50a8d7402 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -3,6 +3,7 @@ */ #include #include +#include #define MODULE_NAME "testmmiotrace" @@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); static void do_write_test(void __iomem *p) { unsigned int i; + mmiotrace_printk("Write test.\n"); for (i = 0; i < 256; i++) iowrite8(i, p + i); for (i = 1024; i < (5 * 1024); i += 2) @@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p) static void do_read_test(void __iomem *p) { unsigned int i; + mmiotrace_printk("Read test.\n"); for (i = 0; i < 256; i++) ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) @@ -39,6 +42,7 @@ static void do_test(void) pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } + mmiotrace_printk("ioremap returned %p.\n", p); do_write_test(p); do_read_test(p); iounmap(p); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 61d19e1b7a0..60cc3bf5c53 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -34,11 +34,15 @@ extern void unregister_kmmio_probe(struct kmmio_probe *p); /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); -/* Called from ioremap.c */ #ifdef CONFIG_MMIOTRACE +/* Called from ioremap.c */ extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr); extern void mmiotrace_iounmap(volatile void __iomem *addr); + +/* For anyone to insert markers. Remember trailing newline. */ +extern int mmiotrace_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); #else static inline void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr) @@ -48,7 +52,15 @@ static inline void mmiotrace_ioremap(resource_size_t offset, static inline void mmiotrace_iounmap(volatile void __iomem *addr) { } -#endif /* CONFIG_MMIOTRACE_HOOKS */ + +static inline int mmiotrace_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 0))); + +static inline int mmiotrace_printk(const char *fmt, ...) +{ + return 0; +} +#endif /* CONFIG_MMIOTRACE */ enum mm_io_opcode { MMIO_READ = 0x1, /* struct mmiotrace_rw */ @@ -81,5 +93,6 @@ extern void enable_mmiotrace(void); extern void disable_mmiotrace(void); extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); +extern int mmio_trace_printk(const char *fmt, va_list args); #endif /* MMIOTRACE_H */ diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ef02747b26d..767d1faf56e 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -335,3 +335,8 @@ void mmio_trace_mapping(struct mmiotrace_map *map) __trace_mmiotrace_map(tr, data, map); preempt_enable(); } + +int mmio_trace_printk(const char *fmt, va_list args) +{ + return trace_vprintk(0, fmt, args); +} -- cgit v1.2.3-70-g09d2 From 4427414170a63331a9cc36b9598502c5cdfe453b Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 22:03:56 +0300 Subject: mmiotrace: remove left-over marker cruft Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 64 ----------------------------------------------- include/linux/mmiotrace.h | 3 +-- 2 files changed, 1 insertion(+), 66 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 5e2e2e72ee8..2c4baa88f2c 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -56,13 +56,6 @@ struct remap_trace { static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); -#if 0 /* XXX: no way gather this info anymore */ -/* Access to this is not per-cpu. */ -static DEFINE_PER_CPU(atomic_t, dropped); -#endif - -static struct dentry *marker_file; - static DEFINE_MUTEX(mmiotrace_mutex); static DEFINE_SPINLOCK(trace_lock); static atomic_t mmiotrace_enabled; @@ -97,44 +90,6 @@ static bool is_enabled(void) return atomic_read(&mmiotrace_enabled); } -#if 0 /* XXX: needs rewrite */ -/* - * Write callback for the debugfs entry: - * Read a marker and write it to the mmio trace log - */ -static ssize_t write_marker(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *event = NULL; - struct mm_io_header *headp; - ssize_t len = (count > 65535) ? 65535 : count; - - event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); - if (!event) - return -ENOMEM; - - headp = (struct mm_io_header *)event; - headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); - headp->data_len = len; - - if (copy_from_user(event + sizeof(*headp), buffer, len)) { - kfree(event); - return -EFAULT; - } - - spin_lock_irq(&trace_lock); -#if 0 /* XXX: convert this to use tracing */ - if (is_enabled()) - relay_write(chan, event, sizeof(*headp) + len); - else -#endif - len = -EINVAL; - spin_unlock_irq(&trace_lock); - kfree(event); - return len; -} -#endif - static void print_pte(unsigned long address) { unsigned int level; @@ -481,26 +436,12 @@ static void leave_uniprocessor(void) } #endif -#if 0 /* XXX: out of order */ -static struct file_operations fops_marker = { - .owner = THIS_MODULE, - .write = write_marker -}; -#endif - void enable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (is_enabled()) goto out; -#if 0 /* XXX: tracing does not support text entries */ - marker_file = debugfs_create_file("marker", 0660, dir, NULL, - &fops_marker); - if (!marker_file) - pr_err(NAME "marker file creation failed.\n"); -#endif - if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); enter_uniprocessor(); @@ -525,11 +466,6 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); - if (marker_file) { - debugfs_remove(marker_file); - marker_file = NULL; - } - pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 60cc3bf5c53..139d7c88d9c 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -67,8 +67,7 @@ enum mm_io_opcode { MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ MMIO_PROBE = 0x3, /* struct mmiotrace_map */ MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ - MMIO_MARKER = 0x5, /* raw char data */ - MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */ + MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ }; struct mmiotrace_rw { -- cgit v1.2.3-70-g09d2 From 37a52f5ef120b93734bb2461744512b55695f69c Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 23 Sep 2008 15:05:46 -0700 Subject: x86: suppress trivial sparse signedness warnings Could just as easily change the three casts to cast to the correct type...this patch changes the type of ftrace_nop instead. Supresses sparse warnings: arch/x86/kernel/ftrace.c:157:14: warning: incorrect type in assignment (different signedness) arch/x86/kernel/ftrace.c:157:14: expected long *static [toplevel] ftrace_nop arch/x86/kernel/ftrace.c:157:14: got unsigned long * arch/x86/kernel/ftrace.c:161:14: warning: incorrect type in assignment (different signedness) arch/x86/kernel/ftrace.c:161:14: expected long *static [toplevel] ftrace_nop arch/x86/kernel/ftrace.c:161:14: got unsigned long * arch/x86/kernel/ftrace.c:165:14: warning: incorrect type in assignment (different signedness) arch/x86/kernel/ftrace.c:165:14: expected long *static [toplevel] ftrace_nop arch/x86/kernel/ftrace.c:165:14: got unsigned long * Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 082d9964262..66d900248fc 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -22,7 +22,7 @@ /* Long is fine, even if it is only 4 bytes ;-) */ -static long *ftrace_nop; +static unsigned long *ftrace_nop; union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; -- cgit v1.2.3-70-g09d2 From ac2b86fdef5b44f194eefaa6b7b6aea9423d1bc2 Mon Sep 17 00:00:00 2001 From: Frédéric Weisbecker Date: Wed, 24 Sep 2008 16:31:56 +0100 Subject: x86/ftrace: use uaccess in atomic context With latest -tip I get this bug: [ 49.439988] in_atomic():0, irqs_disabled():1 [ 49.440118] INFO: lockdep is turned off. [ 49.440118] Pid: 2814, comm: modprobe Tainted: G W 2.6.27-rc7 #4 [ 49.440118] [] __might_sleep+0xe1/0x120 [ 49.440118] [] ftrace_modify_code+0x2a/0xd0 [ 49.440118] [] ? ftrace_test_p6nop+0x0/0xa [ 49.440118] [] __ftrace_update_code+0xfe/0x2f0 [ 49.440118] [] ? ftrace_test_p6nop+0x0/0xa [ 49.440118] [] ftrace_convert_nops+0x50/0x80 [ 49.440118] [] ftrace_init_module+0x16/0x20 [ 49.440118] [] load_module+0x185b/0x1d30 [ 49.440118] [] ? find_get_page+0x0/0xf0 [ 49.440118] [] ? sprintf+0x0/0x30 [ 49.440118] [] ? mutex_lock_interruptible_nested+0x1f2/0x350 [ 49.440118] [] sys_init_module+0x53/0x1b0 [ 49.440118] [] ? do_page_fault+0x0/0x740 [ 49.440118] [] syscall_call+0x7/0xb [ 49.440118] ======================= It is because ftrace_modify_code() calls copy_to_user and copy_from_user. These functions have been inserted after guessing that there couldn't be any race condition but copy_[to/from]_user might sleep and __ftrace_update_code is called with local_irq_saved. These function have been inserted since this commit: d5e92e8978fd2574e415dc2792c5eb592978243d: "ftrace: x86 use copy from user function" Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 66d900248fc..222507e8157 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -71,13 +71,13 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, * No real locking needed, this code is run through * kstop_machine, or before SMP starts. */ - if (__copy_from_user(replaced, (char __user *)ip, MCOUNT_INSN_SIZE)) + if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE)) return 1; if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) return 2; - WARN_ON_ONCE(__copy_to_user((char __user *)ip, new_code, + WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code, MCOUNT_INSN_SIZE)); sync_core(); -- cgit v1.2.3-70-g09d2 From 8b27386a9ce9c7f0f8702cff7565a46802ad57d1 Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Thu, 9 Oct 2008 22:19:08 -0400 Subject: ftrace: make ftrace_test_p6nop disassembler-friendly Commit 4c3dc21b136f8cb4b72afee16c3ba7e961656c0b in tip introduced the 5-byte NOP ftrace_test_p6nop: jmp . + 5 .byte 0x00, 0x00, 0x00 This is not friendly to disassemblers because an odd number of 0x00s ends in the middle of an instruction boundary. This changes the 0x00s to 1-byte NOPs (0x90). Signed-off-by: Anders Kaseorg Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 222507e8157..d073d981a73 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -132,7 +132,9 @@ int __init ftrace_dyn_arch_init(void *data) ".section .text, \"ax\"\n" "ftrace_test_jmp:" "jmp ftrace_test_p6nop\n" - ".byte 0x00,0x00,0x00\n" /* 2 byte jmp + 3 bytes */ + "nop\n" + "nop\n" + "nop\n" /* 2 byte jmp + 3 bytes */ "ftrace_test_p6nop:" P6_NOP5 "jmp 1f\n" @@ -161,7 +163,7 @@ int __init ftrace_dyn_arch_init(void *data) ftrace_nop = (unsigned long *)ftrace_test_nop5; break; case 2: - pr_info("ftrace: converting mcount calls to jmp 1f\n"); + pr_info("ftrace: converting mcount calls to jmp . + 5\n"); ftrace_nop = (unsigned long *)ftrace_test_jmp; break; } -- cgit v1.2.3-70-g09d2 From ca60dfbb69afb549e33527cbf676e4daf8febfb5 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 24 Jun 2008 17:02:38 +0800 Subject: KVM: VMX: Rename misnamed msr bits MSR_IA32_FEATURE_LOCKED is just a bit in fact, which shouldn't be prefixed with MSR_. So is MSR_IA32_FEATURE_VMXON_ENABLED. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 18 +++++++++--------- arch/x86/kvm/vmx.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7041cc52b56..81dac721ec3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1031,9 +1031,9 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - == MSR_IA32_FEATURE_CONTROL_LOCKED; + return (msr & (IA32_FEATURE_CONTROL_LOCKED_BIT | + IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) + == IA32_FEATURE_CONTROL_LOCKED_BIT; /* locked but not enabled */ } @@ -1045,14 +1045,14 @@ static void hardware_enable(void *garbage) INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - != (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + if ((old & (IA32_FEATURE_CONTROL_LOCKED_BIT | + IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) + != (IA32_FEATURE_CONTROL_LOCKED_BIT | + IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); + IA32_FEATURE_CONTROL_LOCKED_BIT | + IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT); write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 17e25995b65..86059f439cb 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -331,8 +331,8 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 -#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 +#define IA32_FEATURE_CONTROL_LOCKED_BIT 0x1 +#define IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT 0x4 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 -- cgit v1.2.3-70-g09d2 From 5fdbf9765b7ba6a45100851154768de703d51e76 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 27 Jun 2008 14:58:02 -0300 Subject: KVM: x86: accessors for guest registers As suggested by Avi, introduce accessors to read/write guest registers. This simplifies the ->cache_regs/->decache_regs interface, and improves register caching which is important for VMX, where the cost of vmcs_read/vmcs_write is significant. [avi: fix warnings] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_cache_regs.h | 32 +++++ arch/x86/kvm/lapic.c | 4 +- arch/x86/kvm/svm.c | 56 ++++----- arch/x86/kvm/vmx.c | 103 ++++++++-------- arch/x86/kvm/x86.c | 268 ++++++++++++++++++++++-------------------- arch/x86/kvm/x86_emulate.c | 19 +-- include/asm-x86/kvm_host.h | 15 ++- 7 files changed, 264 insertions(+), 233 deletions(-) create mode 100644 arch/x86/kvm/kvm_cache_regs.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h new file mode 100644 index 00000000000..1ff819dce7d --- /dev/null +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -0,0 +1,32 @@ +#ifndef ASM_KVM_CACHE_REGS_H +#define ASM_KVM_CACHE_REGS_H + +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, reg); + + return vcpu->arch.regs[reg]; +} + +static inline void kvm_register_write(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + vcpu->arch.regs[reg] = val; + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) +{ + return kvm_register_read(vcpu, VCPU_REGS_RIP); +} + +static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) +{ + kvm_register_write(vcpu, VCPU_REGS_RIP, val); +} + +#endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 73f43de69f6..9fde0ac2426 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -32,6 +32,7 @@ #include #include #include +#include "kvm_cache_regs.h" #include "irq.h" #define PRId64 "d" @@ -558,8 +559,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_run *run = vcpu->run; set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); - kvm_x86_ops->cache_regs(vcpu); - run->tpr_access.rip = vcpu->arch.rip; + run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8233b86c778..54b0bf33e21 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -18,6 +18,7 @@ #include "kvm_svm.h" #include "irq.h" #include "mmu.h" +#include "kvm_cache_regs.h" #include #include @@ -236,13 +237,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) printk(KERN_DEBUG "%s: NOP\n", __func__); return; } - if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) - printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", - __func__, - svm->vmcb->save.rip, - svm->next_rip); + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) + printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", + __func__, kvm_rip_read(vcpu), svm->next_rip); - vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; + kvm_rip_write(vcpu, svm->next_rip); svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; vcpu->arch.interrupt_window_open = 1; @@ -581,6 +580,7 @@ static void init_vmcb(struct vcpu_svm *svm) save->dr7 = 0x400; save->rflags = 2; save->rip = 0x0000fff0; + svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; /* * cr0 val on cpu init should be 0x60000010, we enable cpu @@ -615,10 +615,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcb(svm); if (vcpu->vcpu_id != 0) { - svm->vmcb->save.rip = 0; + kvm_rip_write(vcpu, 0); svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; } + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; return 0; } @@ -721,23 +723,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) rdtscll(vcpu->arch.host_tsc); } -static void svm_cache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.rip = svm->vmcb->save.rip; -} - -static void svm_decache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.rip; -} - static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -1139,14 +1124,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_halt(&svm->vcpu); } static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 3; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); kvm_emulate_hypercall(&svm->vcpu); return 1; @@ -1178,7 +1163,7 @@ static int task_switch_interception(struct vcpu_svm *svm, static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; kvm_emulate_cpuid(&svm->vcpu); return 1; } @@ -1273,9 +1258,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), handler); - svm->vmcb->save.rax = data & 0xffffffff; + svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; skip_emulated_instruction(&svm->vcpu); } return 1; @@ -1359,13 +1344,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u64 data = (svm->vmcb->save.rax & -1u) + u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), handler); - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) kvm_inject_gp(&svm->vcpu, 0); else @@ -1723,6 +1708,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 gs_selector; u16 ldt_selector; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); @@ -1858,6 +1847,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) load_db_regs(svm->host_db_regs); vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; write_dr6(svm->host_dr6); write_dr7(svm->host_dr7); @@ -1977,8 +1969,6 @@ static struct kvm_x86_ops svm_x86_ops = { .set_gdt = svm_set_gdt, .get_dr = svm_get_dr, .set_dr = svm_set_dr, - .cache_regs = svm_cache_regs, - .decache_regs = svm_decache_regs, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81dac721ec3..5a3a0326c27 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -26,6 +26,7 @@ #include #include #include +#include "kvm_cache_regs.h" #include #include @@ -715,9 +716,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) unsigned long rip; u32 interruptibility; - rip = vmcs_readl(GUEST_RIP); + rip = kvm_rip_read(vcpu); rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs_writel(GUEST_RIP, rip); + kvm_rip_write(vcpu, rip); /* * We emulated an instruction, so temporary interrupt blocking @@ -947,24 +948,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) return ret; } -/* - * Sync the rsp and rip registers into the vcpu structure. This allows - * registers to be accessed by indexing vcpu->arch.regs. - */ -static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) -{ - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - vcpu->arch.rip = vmcs_readl(GUEST_RIP); -} - -/* - * Syncs rsp and rip back into the vmcs. Should be called after possible - * modification. - */ -static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) +static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - vmcs_writel(GUEST_RIP, vcpu->arch.rip); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + break; + default: + break; + } } static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) @@ -2019,6 +2015,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) u64 msr; int ret; + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); down_read(&vcpu->kvm->slots_lock); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; @@ -2072,10 +2069,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RFLAGS, 0x02); if (vmx->vcpu.vcpu_id == 0) - vmcs_writel(GUEST_RIP, 0xfff0); + kvm_rip_write(vcpu, 0xfff0); else - vmcs_writel(GUEST_RIP, 0); - vmcs_writel(GUEST_RSP, 0); + kvm_rip_write(vcpu, 0); + kvm_register_write(vcpu, VCPU_REGS_RSP, 0); /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ vmcs_writel(GUEST_DR7, 0x400); @@ -2139,11 +2136,11 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); + vmx->rmode.irq.rip = kvm_rip_read(vcpu); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -2288,7 +2285,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } error_code = 0; - rip = vmcs_readl(GUEST_RIP); + rip = kvm_rip_read(vcpu); if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { @@ -2386,27 +2383,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], - (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); + KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), + handler); switch (cr) { case 0: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr0(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 3: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr3(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 4: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr4(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr8(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); if (irqchip_in_kernel(vcpu->kvm)) return 1; @@ -2415,7 +2410,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) }; break; case 2: /* clts */ - vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); @@ -2426,21 +2420,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case 1: /*mov from cr*/ switch (cr) { case 3: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = vcpu->arch.cr3; - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, vcpu->arch.cr3); KVMTRACE_3D(CR_READ, vcpu, (u32)cr, - (u32)vcpu->arch.regs[reg], - (u32)((u64)vcpu->arch.regs[reg] >> 32), + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), handler); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = kvm_get_cr8(vcpu); - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); KVMTRACE_2D(CR_READ, vcpu, (u32)cr, - (u32)vcpu->arch.regs[reg], handler); + (u32)kvm_register_read(vcpu, reg), handler); skip_emulated_instruction(vcpu); return 1; } @@ -2472,7 +2462,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & 7; reg = (exit_qualification >> 8) & 15; - vcpu_load_rsp_rip(vcpu); if (exit_qualification & 16) { /* mov from dr */ switch (dr) { @@ -2485,12 +2474,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: val = 0; } - vcpu->arch.regs[reg] = val; + kvm_register_write(vcpu, reg, val); KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { /* mov to dr */ } - vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; } @@ -2735,8 +2723,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; - KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), - (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); + KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), + (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ @@ -2922,9 +2910,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) static void fixup_rmode_irq(struct vcpu_vmx *vmx) { vmx->rmode.irq.pending = 0; - if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) + if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) return; - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); + kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; @@ -2941,6 +2929,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + /* * Loading guest fpu may have cleared host cr0.ts */ @@ -3061,6 +3054,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif ); + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + vcpu->arch.regs_dirty = 0; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); @@ -3224,8 +3220,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .cache_regs = vcpu_load_rsp_rip, - .decache_regs = vcpu_put_rsp_rip, + .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d682fc6aeb..2f0696bc7d2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -19,6 +19,7 @@ #include "mmu.h" #include "i8254.h" #include "tss.h" +#include "kvm_cache_regs.h" #include #include @@ -61,6 +62,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); struct kvm_x86_ops *kvm_x86_ops; +EXPORT_SYMBOL_GPL(kvm_x86_ops); struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -2080,7 +2082,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { u8 opcodes[4]; - unsigned long rip = vcpu->arch.rip; + unsigned long rip = kvm_rip_read(vcpu); unsigned long rip_linear; if (!printk_ratelimit()) @@ -2102,6 +2104,14 @@ static struct x86_emulate_ops emulate_ops = { .cmpxchg_emulated = emulator_cmpxchg_emulated, }; +static void cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; +} + int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, @@ -2112,7 +2122,13 @@ int emulate_instruction(struct kvm_vcpu *vcpu, struct decode_cache *c; vcpu->arch.mmio_fault_cr2 = cr2; - kvm_x86_ops->cache_regs(vcpu); + /* + * TODO: fix x86_emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); vcpu->mmio_is_write = 0; vcpu->arch.pio.string = 0; @@ -2172,7 +2188,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } - kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (vcpu->mmio_is_write) { @@ -2225,20 +2240,19 @@ int complete_pio(struct kvm_vcpu *vcpu) struct kvm_pio_request *io = &vcpu->arch.pio; long delta; int r; - - kvm_x86_ops->cache_regs(vcpu); + unsigned long val; if (!io->string) { - if (io->in) - memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, - io->size); + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(&val, vcpu->arch.pio_data, io->size); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + } } else { if (io->in) { r = pio_copy_data(vcpu); - if (r) { - kvm_x86_ops->cache_regs(vcpu); + if (r) return r; - } } delta = 1; @@ -2248,19 +2262,24 @@ int complete_pio(struct kvm_vcpu *vcpu) * The size of the register should really depend on * current address size. */ - vcpu->arch.regs[VCPU_REGS_RCX] -= delta; + val = kvm_register_read(vcpu, VCPU_REGS_RCX); + val -= delta; + kvm_register_write(vcpu, VCPU_REGS_RCX, val); } if (io->down) delta = -delta; delta *= io->size; - if (io->in) - vcpu->arch.regs[VCPU_REGS_RDI] += delta; - else - vcpu->arch.regs[VCPU_REGS_RSI] += delta; + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RDI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RDI, val); + } else { + val = kvm_register_read(vcpu, VCPU_REGS_RSI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RSI, val); + } } - kvm_x86_ops->decache_regs(vcpu); - io->count -= io->cur_count; io->cur_count = 0; @@ -2313,6 +2332,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { struct kvm_io_device *pio_dev; + unsigned long val; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -2333,8 +2353,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, handler); - kvm_x86_ops->cache_regs(vcpu); - memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -2519,13 +2539,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int r = 1; - kvm_x86_ops->cache_regs(vcpu); - - nr = vcpu->arch.regs[VCPU_REGS_RAX]; - a0 = vcpu->arch.regs[VCPU_REGS_RBX]; - a1 = vcpu->arch.regs[VCPU_REGS_RCX]; - a2 = vcpu->arch.regs[VCPU_REGS_RDX]; - a3 = vcpu->arch.regs[VCPU_REGS_RSI]; + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); @@ -2548,8 +2566,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } - vcpu->arch.regs[VCPU_REGS_RAX] = ret; - kvm_x86_ops->decache_regs(vcpu); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; } @@ -2559,6 +2576,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; int ret = 0; + unsigned long rip = kvm_rip_read(vcpu); /* @@ -2568,9 +2586,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) */ kvm_mmu_zap_all(vcpu->kvm); - kvm_x86_ops->cache_regs(vcpu); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) + if (emulator_write_emulated(rip, instruction, 3, vcpu) != X86EMUL_CONTINUE) ret = -EFAULT; @@ -2700,13 +2717,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) u32 function, index; struct kvm_cpuid_entry2 *e, *best; - kvm_x86_ops->cache_regs(vcpu); - function = vcpu->arch.regs[VCPU_REGS_RAX]; - index = vcpu->arch.regs[VCPU_REGS_RCX]; - vcpu->arch.regs[VCPU_REGS_RAX] = 0; - vcpu->arch.regs[VCPU_REGS_RBX] = 0; - vcpu->arch.regs[VCPU_REGS_RCX] = 0; - vcpu->arch.regs[VCPU_REGS_RDX] = 0; + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { e = &vcpu->arch.cpuid_entries[i]; @@ -2724,18 +2740,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) best = e; } if (best) { - vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; - vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; - vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); } - kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu); KVMTRACE_5D(CPUID, vcpu, function, - (u32)vcpu->arch.regs[VCPU_REGS_RAX], - (u32)vcpu->arch.regs[VCPU_REGS_RBX], - (u32)vcpu->arch.regs[VCPU_REGS_RCX], - (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); + (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -2917,8 +2932,8 @@ again: * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) { - kvm_x86_ops->cache_regs(vcpu); - profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); } if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) @@ -2999,11 +3014,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } #endif - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { - kvm_x86_ops->cache_regs(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; - kvm_x86_ops->decache_regs(vcpu); - } + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, + kvm_run->hypercall.ret); r = __vcpu_run(vcpu, kvm_run); @@ -3019,28 +3032,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); - kvm_x86_ops->cache_regs(vcpu); - - regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; - regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; - regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; - regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; - regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; - regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; - regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); #ifdef CONFIG_X86_64 - regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; - regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; - regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; - regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; - regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; - regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; - regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; - regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); #endif - regs->rip = vcpu->arch.rip; + regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_x86_ops->get_rflags(vcpu); /* @@ -3058,29 +3069,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; - vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; - vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; - vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; - vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; - vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; - vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; - vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); #ifdef CONFIG_X86_64 - vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; - vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; - vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; - vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; - vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; - vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; - vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; - vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); + #endif - vcpu->arch.rip = regs->rip; + kvm_rip_write(vcpu, regs->rip); kvm_x86_ops->set_rflags(vcpu, regs->rflags); - kvm_x86_ops->decache_regs(vcpu); vcpu->arch.exception.pending = false; @@ -3316,17 +3327,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { tss->cr3 = vcpu->arch.cr3; - tss->eip = vcpu->arch.rip; + tss->eip = kvm_rip_read(vcpu); tss->eflags = kvm_x86_ops->get_rflags(vcpu); - tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; - tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; - tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; - tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; - tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; - tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; - tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; - + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); @@ -3342,17 +3352,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, { kvm_set_cr3(vcpu, tss->cr3); - vcpu->arch.rip = tss->eip; + kvm_rip_write(vcpu, tss->eip); kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); - vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; - vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; - vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; - vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; - vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; - vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; - vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) return 1; @@ -3380,16 +3390,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, static void save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { - tss->ip = vcpu->arch.rip; + tss->ip = kvm_rip_read(vcpu); tss->flag = kvm_x86_ops->get_rflags(vcpu); - tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; - tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; - tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; - tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; - tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; - tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; - tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; - tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; + tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); @@ -3402,16 +3412,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, static int load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { - vcpu->arch.rip = tss->ip; + kvm_rip_write(vcpu, tss->ip); kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); - vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; - vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; - vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; - vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; - vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; - vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; - vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; - vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) return 1; @@ -3534,7 +3544,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } kvm_x86_ops->skip_emulated_instruction(vcpu); - kvm_x86_ops->cache_regs(vcpu); if (nseg_desc.type & 8) ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, @@ -3559,7 +3568,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); out: - kvm_x86_ops->decache_regs(vcpu); return ret; } EXPORT_SYMBOL_GPL(kvm_task_switch); diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index f2f90468f8b..d5da7f14d53 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -26,6 +26,7 @@ #define DPRINTF(_f, _a ...) printf(_f , ## _a) #else #include +#include "kvm_cache_regs.h" #define DPRINTF(x...) do {} while (0) #endif #include @@ -839,7 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* Shadow copy of register state. Committed on successful emulation. */ memset(c, 0, sizeof(struct decode_cache)); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); @@ -1267,7 +1268,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ if (c->regs[VCPU_REGS_RCX] == 0) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } /* The second termination condition only applies for REPE @@ -1281,17 +1282,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) (c->b == 0xae) || (c->b == 0xaf)) { if ((c->rep_prefix == REPE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == 0)) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } if ((c->rep_prefix == REPNE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } } c->regs[VCPU_REGS_RCX]--; - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } if (c->src.type == OP_MEM) { @@ -1768,7 +1769,7 @@ writeback: /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); done: if (rc == X86EMUL_UNHANDLEABLE) { @@ -1793,7 +1794,7 @@ twobyte_insn: goto done; /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); /* Disable writeback. */ c->dst.type = OP_NONE; break; @@ -1889,7 +1890,7 @@ twobyte_insn: rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); if (rc) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; @@ -1899,7 +1900,7 @@ twobyte_insn: rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); if (rc) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; c->regs[VCPU_REGS_RDX] = msr_data >> 32; diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 69794547f51..8c886be0103 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -89,7 +89,7 @@ extern struct list_head vm_list; struct kvm_vcpu; struct kvm; -enum { +enum kvm_reg { VCPU_REGS_RAX = 0, VCPU_REGS_RCX = 1, VCPU_REGS_RDX = 2, @@ -108,6 +108,7 @@ enum { VCPU_REGS_R14 = 14, VCPU_REGS_R15 = 15, #endif + VCPU_REGS_RIP, NR_VCPU_REGS }; @@ -219,8 +220,13 @@ struct kvm_vcpu_arch { int interrupt_window_open; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); - unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ - unsigned long rip; /* needs vcpu_load_rsp_rip() */ + /* + * rip and regs accesses must go through + * kvm_{register,rip}_{read,write} functions. + */ + unsigned long regs[NR_VCPU_REGS]; + u32 regs_avail; + u32 regs_dirty; unsigned long cr0; unsigned long cr2; @@ -414,8 +420,7 @@ struct kvm_x86_ops { unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, int *exception); - void (*cache_regs)(struct kvm_vcpu *vcpu); - void (*decache_regs)(struct kvm_vcpu *vcpu); + void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); -- cgit v1.2.3-70-g09d2 From 867767a365ee74a3adcfaba27075eefb66b14bfd Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 27 Jun 2008 15:55:02 +0300 Subject: KVM: Introduce kvm_set_irq to inject interrupts in guests This function injects an interrupt into the guest given the kvm struct, the (guest) irq number and the interrupt level. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/irq.c | 11 +++++++++++ arch/x86/kvm/irq.h | 2 ++ 2 files changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 76d736b5f66..0d9e55275af 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -100,3 +100,14 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); } + +/* This should be called with the kvm->lock mutex held */ +void kvm_set_irq(struct kvm *kvm, int irq, int level) +{ + /* Not possible to detect if the guest uses the PIC or the + * IOAPIC. So set the bit in both. The guest will ignore + * writes to the unused one. + */ + kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level); + kvm_pic_set_irq(pic_irqchip(kvm), irq, level); +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7ca47cbb48b..07ff2aef0c1 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -82,6 +82,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s); +void kvm_set_irq(struct kvm *kvm, int irq, int level); + void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -- cgit v1.2.3-70-g09d2 From 31aa2b44afd5e73365221b1de66f6081e4616f33 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 11 Jul 2008 17:59:46 +0300 Subject: KVM: MMU: Separate the code for unlinking a shadow page from its parents Place into own function, in preparation for further cleanups. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3da2508eb22..81016a3a6fd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -991,11 +991,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) kvm->vcpus[i]->arch.last_pte_updated = NULL; } -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *parent_pte; - ++kvm->stat.mmu_shadow_zapped; while (sp->multimapped || sp->parent_pte) { if (!sp->multimapped) parent_pte = sp->parent_pte; @@ -1010,7 +1009,13 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_put_page(sp, parent_pte); set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); } +} + +static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + ++kvm->stat.mmu_shadow_zapped; kvm_mmu_page_unlink_children(kvm, sp); + kvm_mmu_unlink_parents(kvm, sp); if (!sp->root_count) { if (!sp->role.metaphysical && !sp->role.invalid) unaccount_shadowed(kvm, sp->gfn); -- cgit v1.2.3-70-g09d2 From 5b5c6a5a60801effb559e787a947885d9850a7da Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 11 Jul 2008 18:07:26 +0300 Subject: KVM: MMU: Simplify kvm_mmu_zap_page() The twisty maze of conditionals can be reduced. [joerg: fix tlb flushing] Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 81016a3a6fd..c3afbfe6b0c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -955,7 +955,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, rmap_remove(kvm, &pt[i]); pt[i] = shadow_trap_nonpresent_pte; } - kvm_flush_remote_tlbs(kvm); return; } @@ -974,7 +973,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } pt[i] = shadow_trap_nonpresent_pte; } - kvm_flush_remote_tlbs(kvm); } static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) @@ -1016,18 +1014,16 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) ++kvm->stat.mmu_shadow_zapped; kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + kvm_flush_remote_tlbs(kvm); + if (!sp->role.invalid && !sp->role.metaphysical) + unaccount_shadowed(kvm, sp->gfn); if (!sp->root_count) { - if (!sp->role.metaphysical && !sp->role.invalid) - unaccount_shadowed(kvm, sp->gfn); hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { - int invalid = sp->role.invalid; - list_move(&sp->link, &kvm->arch.active_mmu_pages); sp->role.invalid = 1; + list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); - if (!sp->role.metaphysical && !invalid) - unaccount_shadowed(kvm, sp->gfn); } kvm_mmu_reset_last_pte_updated(kvm); } @@ -1842,7 +1838,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical) + if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) continue; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); -- cgit v1.2.3-70-g09d2 From cf393f75661f4b17451377b353833eb5502a9688 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Jul 2008 16:20:21 +0300 Subject: KVM: Move NMI IRET fault processing to new vmx_complete_interrupts() Currently most interrupt exit processing is handled on the entry path, which is confusing. Move the NMI IRET fault processing to a new function, vmx_complete_interrupts(), which is called on the vmexit path. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5a3a0326c27..2adfacb0033 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2817,6 +2817,27 @@ static void enable_intr_window(struct kvm_vcpu *vcpu) enable_irq_window(vcpu); } +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info; + bool unblock_nmi; + u8 vector; + + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (cpu_has_virtual_nmis()) { + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 25.7.1.2 + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + */ + if (unblock_nmi && vector != DF_VECTOR) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } +} + static void vmx_intr_assist(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2873,23 +2894,12 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) return; } if (cpu_has_virtual_nmis()) { - /* - * SDM 3: 25.7.1.2 - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - */ - if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && - (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | - GUEST_INTR_STATE_NMI); - else if (vcpu->arch.nmi_pending) { + if (vcpu->arch.nmi_pending) { if (vmx_nmi_enabled(vcpu)) vmx_inject_nmi(vcpu); enable_intr_window(vcpu); return; } - } if (!kvm_cpu_has_interrupt(vcpu)) return; @@ -3076,6 +3086,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); } + + vmx_complete_interrupts(vmx); } static void vmx_free_vmcs(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 668f612fa0d8d4120ec5dc0725d7e1ca3152a954 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 Jul 2008 09:28:55 +0300 Subject: KVM: VMX: Move nmi injection failure processing to vm exit path Instead of processing nmi injection failure in the vm entry path, move it to the vm exit path (vm_complete_interrupts()). This separates nmi injection from nmi post-processing, and moves the nmi state from the VT state into vcpu state (new variable nmi_injected specifying an injection in progress). Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 49 ++++++++++++++++++++++++++++++---------------- include/asm-x86/kvm_host.h | 1 + 2 files changed, 33 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2adfacb0033..ce13b53d21c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2151,7 +2151,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - vcpu->arch.nmi_pending = 0; } static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) @@ -2820,8 +2819,11 @@ static void enable_intr_window(struct kvm_vcpu *vcpu) static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { u32 exit_intr_info; + u32 idt_vectoring_info; bool unblock_nmi; u8 vector; + int type; + bool idtv_info_valid; exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (cpu_has_virtual_nmis()) { @@ -2836,18 +2838,34 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); } + + idt_vectoring_info = vmx->idt_vectoring_info; + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + if (vmx->vcpu.arch.nmi_injected) { + /* + * SDM 3: 25.7.1.2 + * Clear bit "block by NMI" before VM entry if a NMI delivery + * faulted. + */ + if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->vcpu.arch.nmi_injected = false; + } } static void vmx_intr_assist(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field, intr_info_field, exit_intr_info_field; + u32 idtv_info_field, intr_info_field; int vector; update_tpr_threshold(vcpu); intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); idtv_info_field = vmx->idt_vectoring_info; if (intr_info_field & INTR_INFO_VALID_MASK) { if (idtv_info_field & INTR_INFO_VALID_MASK) { @@ -2871,17 +2889,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); - /* - * SDM 3: 25.7.1.2 - * Clear bit "block by NMI" before VM entry if a NMI delivery - * faulted. - */ - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - ~GUEST_INTR_STATE_NMI); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field & ~INTR_INFO_RESVD_BITS_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, @@ -2894,9 +2901,17 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) return; } if (cpu_has_virtual_nmis()) { - if (vcpu->arch.nmi_pending) { - if (vmx_nmi_enabled(vcpu)) - vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vmx_nmi_enabled(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_intr_window(vcpu); + return; + } + } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); enable_intr_window(vcpu); return; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 4f2bd884fd3..7cf69fd1dcf 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -300,6 +300,7 @@ struct kvm_vcpu_arch { struct page *time_page; bool nmi_pending; + bool nmi_injected; u64 mtrr[0x100]; }; -- cgit v1.2.3-70-g09d2 From 26eef70c3e8c76e73dff2579c792fc7355f8a291 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Jul 2008 14:59:22 +0300 Subject: KVM: Clear exception queue before emulating an instruction If we're emulating an instruction, either it will succeed, in which case any previously queued exception will be spurious, or we will requeue the same exception. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ arch/x86/kvm/x86.h | 11 +++++++++++ 2 files changed, 13 insertions(+) create mode 100644 arch/x86/kvm/x86.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f0696bc7d2..5620df2685d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -20,6 +20,7 @@ #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" +#include "x86.h" #include #include @@ -2121,6 +2122,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, int r; struct decode_cache *c; + kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; /* * TODO: fix x86_emulate.c to use guest_read/write_register diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h new file mode 100644 index 00000000000..c666649c4bb --- /dev/null +++ b/arch/x86/kvm/x86.h @@ -0,0 +1,11 @@ +#ifndef ARCH_X86_KVM_X86_H +#define ARCH_X86_KVM_X86_H + +#include + +static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.exception.pending = false; +} + +#endif -- cgit v1.2.3-70-g09d2 From 35920a356957eea9fd1f9da043f93469e8d72eab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Jul 2008 14:50:12 +0300 Subject: KVM: VMX: Fix pending exception processing The vmx code assumes that IDT-Vectoring can only be set when an exception is injected due to the exception in question. That's not true, however: if the exception is injected correctly, and later another exception occurs but its delivery is blocked due to a fault, then we will incorrectly assume the first exception was not delivered. Fix by unconditionally dequeuing the pending exception, and requeuing it (or the second exception) if we see it in the IDT-Vectoring field. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ce13b53d21c..ed4fe8e72ad 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -27,6 +27,7 @@ #include #include #include "kvm_cache_regs.h" +#include "x86.h" #include #include @@ -744,9 +745,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, static bool vmx_exception_injected(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + return false; } /* @@ -2824,6 +2823,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) u8 vector; int type; bool idtv_info_valid; + u32 error; exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (cpu_has_virtual_nmis()) { @@ -2855,6 +2855,15 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) else vmx->vcpu.arch.nmi_injected = false; } + kvm_clear_exception_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + error = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, error); + } else + kvm_queue_exception(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; + } } static void vmx_intr_assist(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From 937a7eaef9f08342958d17055a350982b7bd92cb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Jul 2008 15:17:01 +0300 Subject: KVM: Add a pending interrupt queue Similar to the exception queue, this hold interrupts that have been accepted by the virtual processor core but not yet injected. Not yet used. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.h | 11 +++++++++++ include/asm-x86/kvm_host.h | 5 +++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c666649c4bb..6a4be78a738 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,4 +8,15 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) vcpu->arch.exception.pending = false; } +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) +{ + vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.nr = vector; +} + +static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.interrupt.pending = false; +} + #endif diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 7cf69fd1dcf..65c6a0e5b73 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -275,6 +275,11 @@ struct kvm_vcpu_arch { u32 error_code; } exception; + struct kvm_queued_interrupt { + bool pending; + u8 nr; + } interrupt; + struct { int active; u8 save_iopl; -- cgit v1.2.3-70-g09d2 From f7d9238f5dc9306fd3bf1653c2939eef72f94d06 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Jul 2008 16:14:28 +0300 Subject: KVM: VMX: Move interrupt post-processing to vmx_complete_interrupts() Instead of looking at failed injections in the vm entry path, move processing to the exit path in vmx_complete_interrupts(). This simplifes the logic and removes any state that is hidden in vmx registers. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 77 ++++++++++++++---------------------------------------- 1 file changed, 20 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ed4fe8e72ad..e8fed9b8756 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1002,17 +1002,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int vmx_get_irq(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field; - - idtv_info_field = vmx->idt_vectoring_info; - if (idtv_info_field & INTR_INFO_VALID_MASK) { - if (is_external_interrupt(idtv_info_field)) - return idtv_info_field & VECTORING_INFO_VECTOR_MASK; - else - printk(KERN_DEBUG "pending exception: not handled yet\n"); - } - return -1; + if (!vcpu->arch.interrupt.pending) + return -1; + return vcpu->arch.interrupt.nr; } static __init int cpu_has_kvm_support(void) @@ -2293,7 +2285,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); - if (vect_info & VECTORING_INFO_VALID_MASK) + if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } @@ -2864,51 +2856,20 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) kvm_queue_exception(&vmx->vcpu, vector); vmx->idt_vectoring_info = 0; } + kvm_clear_interrupt_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { + kvm_queue_interrupt(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; + } } static void vmx_intr_assist(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field, intr_info_field; - int vector; + u32 intr_info_field; update_tpr_threshold(vcpu); intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - idtv_info_field = vmx->idt_vectoring_info; - if (intr_info_field & INTR_INFO_VALID_MASK) { - if (idtv_info_field & INTR_INFO_VALID_MASK) { - /* TODO: fault when IDT_Vectoring */ - if (printk_ratelimit()) - printk(KERN_ERR "Fault when IDT_Vectoring\n"); - } - enable_intr_window(vcpu); - return; - } - if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_EXT_INTR - && vcpu->arch.rmode.active) { - u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; - - vmx_inject_irq(vcpu, vect); - enable_intr_window(vcpu); - return; - } - - KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field - & ~INTR_INFO_RESVD_BITS_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - - if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - enable_intr_window(vcpu); - return; - } if (cpu_has_virtual_nmis()) { if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { if (vmx_nmi_enabled(vcpu)) { @@ -2925,14 +2886,16 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) return; } } - if (!kvm_cpu_has_interrupt(vcpu)) - return; - if (vmx_irq_enabled(vcpu)) { - vector = kvm_cpu_get_interrupt(vcpu); - vmx_inject_irq(vcpu, vector); - kvm_timer_intr_post(vcpu, vector); - } else - enable_irq_window(vcpu); + if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { + if (vmx_irq_enabled(vcpu)) + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); + else + enable_irq_window(vcpu); + } + if (vcpu->arch.interrupt.pending) { + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + } } /* -- cgit v1.2.3-70-g09d2 From 60bd83a125030878665684f353c7d36fd54f09fd Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sat, 12 Jul 2008 16:02:08 +0300 Subject: KVM: VMX: Remove redundant check in handle_rmode_exception Since checking for vcpu->arch.rmode.active is already done whenever we call handle_rmode_exception(), checking it inside the function is redundant. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e8fed9b8756..9591ca73191 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2224,9 +2224,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { - if (!vcpu->arch.rmode.active) - return 0; - /* * Instruction with address size override prefix opcode 0x67 * Cause the #SS fault with 0 error code in VM86 mode. -- cgit v1.2.3-70-g09d2 From 7edd0ce05892831c77fa4cebe24a6056d33336d5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Jul 2008 14:45:39 +0300 Subject: KVM: Consolidate PIC isr clearing into a function Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index c31164e8aa4..55e179ad98e 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -30,6 +30,11 @@ #include +static void pic_clear_isr(struct kvm_kpic_state *s, int irq) +{ + s->isr &= ~(1 << irq); +} + /* * set irq level. If an edge is detected, then the IRR is set to 1 */ @@ -141,11 +146,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) */ static inline void pic_intack(struct kvm_kpic_state *s, int irq) { + s->isr |= 1 << irq; if (s->auto_eoi) { if (s->rotate_on_auto_eoi) s->priority_add = (irq + 1) & 7; - } else - s->isr |= (1 << irq); + pic_clear_isr(s, irq); + } /* * We don't clear a level sensitive interrupt here */ @@ -243,7 +249,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) priority = get_priority(s, s->isr); if (priority != 8) { irq = (priority + s->priority_add) & 7; - s->isr &= ~(1 << irq); + pic_clear_isr(s, irq); if (cmd == 5) s->priority_add = (irq + 1) & 7; pic_update_irq(s->pics_state); @@ -251,7 +257,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) break; case 3: irq = val & 7; - s->isr &= ~(1 << irq); + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; case 6: @@ -260,8 +266,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) break; case 7: irq = val & 7; - s->isr &= ~(1 << irq); s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; default: @@ -303,7 +309,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) s->pics_state->pics[0].irr &= ~(1 << 2); } s->irr &= ~(1 << ret); - s->isr &= ~(1 << ret); + pic_clear_isr(s, ret); if (addr1 >> 7 || ret != 2) pic_update_irq(s->pics_state); } else { -- cgit v1.2.3-70-g09d2 From 19bd8afdc4e6fbb47e4841f8d771f8cb29916d9f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 13 Jul 2008 13:40:55 +0200 Subject: KVM: Consolidate XX_VECTOR defines Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ---- arch/x86/kvm/vmx.c | 2 +- include/asm-x86/kvm_host.h | 1 + 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 54b0bf33e21..743c98d135c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -36,10 +36,6 @@ MODULE_LICENSE("GPL"); #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 -#define DB_VECTOR 1 -#define UD_VECTOR 6 -#define GP_VECTOR 13 - #define DR7_GD_MASK (1 << 13) #define DR6_BD_MASK (1 << 13) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9591ca73191..5c9dac567a7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -470,7 +470,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug.enabled) - eb |= 1u << 1; + eb |= 1u << DB_VECTOR; if (vcpu->arch.rmode.active) eb = ~0; if (vm_need_ept()) diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 65c6a0e5b73..da3c1979b73 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -57,6 +57,7 @@ #define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) #define DE_VECTOR 0 +#define DB_VECTOR 1 #define UD_VECTOR 6 #define NM_VECTOR 7 #define DF_VECTOR 8 -- cgit v1.2.3-70-g09d2 From 77ab6db0a1c403387b403e9351ab3f5ae1df83e6 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 14 Jul 2008 12:28:51 +0200 Subject: KVM: VMX: Reinject real mode exception As we execute real mode guests in VM86 mode, exception have to be reinjected appropriately when the guest triggered them. For this purpose the patch adopts the real-mode injection pattern used in vmx_inject_irq to vmx_queue_exception, additionally taking care that the IP is set correctly for #BP exceptions. Furthermore it extends handle_rmode_exception to reinject all those exceptions that can be raised in real mode. This fixes the execution of himem.exe from FreeDOS and also makes its debug.com work properly. Note that guest debugging in real mode is broken now. This has to be fixed by the scheduled debugging infrastructure rework (will be done once base patches for QEMU have been accepted). Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 41 +++++++++++++++++++++++++++++++++++++++-- include/asm-x86/kvm_host.h | 4 ++++ 2 files changed, 43 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5c9dac567a7..2879880076d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -735,12 +735,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (has_error_code) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = nr; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (nr == BP_VECTOR) + vmx->rmode.irq.rip++; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + nr | INTR_TYPE_SOFT_INTR + | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) + | INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, nr | INTR_TYPE_EXCEPTION | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | INTR_INFO_VALID_MASK); - if (has_error_code) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); } static bool vmx_exception_injected(struct kvm_vcpu *vcpu) @@ -2231,6 +2249,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) return 1; + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + switch (vec) { + case DE_VECTOR: + case DB_VECTOR: + case BP_VECTOR: + case OF_VECTOR: + case BR_VECTOR: + case UD_VECTOR: + case DF_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + case MF_VECTOR: + kvm_queue_exception(vcpu, vec); + return 1; + } return 0; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index da3c1979b73..83afa10c77f 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -58,6 +58,9 @@ #define DE_VECTOR 0 #define DB_VECTOR 1 +#define BP_VECTOR 3 +#define OF_VECTOR 4 +#define BR_VECTOR 5 #define UD_VECTOR 6 #define NM_VECTOR 7 #define DF_VECTOR 8 @@ -66,6 +69,7 @@ #define SS_VECTOR 12 #define GP_VECTOR 13 #define PF_VECTOR 14 +#define MF_VECTOR 16 #define MC_VECTOR 18 #define SELECTOR_TI_MASK (1 << 2) -- cgit v1.2.3-70-g09d2 From c801949ddf0a51074937f488a52072825ed50174 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 Jul 2008 14:44:59 +0300 Subject: KVM: VMX: Unify register save/restore across 32 and 64 bit hosts Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 90 ++++++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2879880076d..fc8996b1043 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2955,6 +2955,14 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | vmx->rmode.irq.vector; } +#ifdef CONFIG_X86_64 +#define R "r" +#define Q "q" +#else +#define R "e" +#define Q "l" +#endif + static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2972,26 +2980,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm( /* Store host registers */ -#ifdef CONFIG_X86_64 - "push %%rdx; push %%rbp;" - "push %%rcx \n\t" -#else - "push %%edx; push %%ebp;" - "push %%ecx \n\t" -#endif + "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "mov %c[rax](%0), %%"R"ax \n\t" + "mov %c[rbx](%0), %%"R"bx \n\t" + "mov %c[rdx](%0), %%"R"dx \n\t" + "mov %c[rsi](%0), %%"R"si \n\t" + "mov %c[rdi](%0), %%"R"di \n\t" + "mov %c[rbp](%0), %%"R"bp \n\t" #ifdef CONFIG_X86_64 - "mov %c[cr2](%0), %%rax \n\t" - "mov %%rax, %%cr2 \n\t" - "mov %c[rax](%0), %%rax \n\t" - "mov %c[rbx](%0), %%rbx \n\t" - "mov %c[rdx](%0), %%rdx \n\t" - "mov %c[rsi](%0), %%rsi \n\t" - "mov %c[rdi](%0), %%rdi \n\t" - "mov %c[rbp](%0), %%rbp \n\t" "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" "mov %c[r10](%0), %%r10 \n\t" @@ -3000,18 +3003,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[r13](%0), %%r13 \n\t" "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" - "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ -#else - "mov %c[cr2](%0), %%eax \n\t" - "mov %%eax, %%cr2 \n\t" - "mov %c[rax](%0), %%eax \n\t" - "mov %c[rbx](%0), %%ebx \n\t" - "mov %c[rdx](%0), %%edx \n\t" - "mov %c[rsi](%0), %%esi \n\t" - "mov %c[rdi](%0), %%edi \n\t" - "mov %c[rbp](%0), %%ebp \n\t" - "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ #endif + "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + /* Enter guest mode */ "jne .Llaunched \n\t" __ex(ASM_VMX_VMLAUNCH) "\n\t" @@ -3019,15 +3013,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ + "xchg %0, (%%"R"sp) \n\t" + "mov %%"R"ax, %c[rax](%0) \n\t" + "mov %%"R"bx, %c[rbx](%0) \n\t" + "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "mov %%"R"dx, %c[rdx](%0) \n\t" + "mov %%"R"si, %c[rsi](%0) \n\t" + "mov %%"R"di, %c[rdi](%0) \n\t" + "mov %%"R"bp, %c[rbp](%0) \n\t" #ifdef CONFIG_X86_64 - "xchg %0, (%%rsp) \n\t" - "mov %%rax, %c[rax](%0) \n\t" - "mov %%rbx, %c[rbx](%0) \n\t" - "pushq (%%rsp); popq %c[rcx](%0) \n\t" - "mov %%rdx, %c[rdx](%0) \n\t" - "mov %%rsi, %c[rsi](%0) \n\t" - "mov %%rdi, %c[rdi](%0) \n\t" - "mov %%rbp, %c[rbp](%0) \n\t" "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" "mov %%r10, %c[r10](%0) \n\t" @@ -3036,24 +3030,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" - "mov %%cr2, %%rax \n\t" - "mov %%rax, %c[cr2](%0) \n\t" - - "pop %%rbp; pop %%rbp; pop %%rdx \n\t" -#else - "xchg %0, (%%esp) \n\t" - "mov %%eax, %c[rax](%0) \n\t" - "mov %%ebx, %c[rbx](%0) \n\t" - "pushl (%%esp); popl %c[rcx](%0) \n\t" - "mov %%edx, %c[rdx](%0) \n\t" - "mov %%esi, %c[rsi](%0) \n\t" - "mov %%edi, %c[rdi](%0) \n\t" - "mov %%ebp, %c[rbp](%0) \n\t" - "mov %%cr2, %%eax \n\t" - "mov %%eax, %c[cr2](%0) \n\t" - - "pop %%ebp; pop %%ebp; pop %%edx \n\t" #endif + "mov %%cr2, %%"R"ax \n\t" + "mov %%"R"ax, %c[cr2](%0) \n\t" + + "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), @@ -3077,11 +3058,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" + , R"bx", R"di", R"si" #ifdef CONFIG_X86_64 - , "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "rsi" #endif ); @@ -3111,6 +3090,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vmx_complete_interrupts(vmx); } +#undef R +#undef Q + static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); -- cgit v1.2.3-70-g09d2 From 80e31d4f61f69d0e480ae092cda0e590d6a30aeb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 Jul 2008 14:44:59 +0300 Subject: KVM: SVM: Unify register save/restore across 32 and 64 bit hosts Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 78 ++++++++++++++++++------------------------------------ 1 file changed, 26 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 743c98d135c..179c2e00d0f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1697,6 +1697,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +#ifdef CONFIG_X86_64 +#define R "r" +#else +#define R "e" +#endif + static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1735,19 +1741,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) local_irq_enable(); asm volatile ( + "push %%"R"bp; \n\t" + "mov %c[rbx](%[svm]), %%"R"bx \n\t" + "mov %c[rcx](%[svm]), %%"R"cx \n\t" + "mov %c[rdx](%[svm]), %%"R"dx \n\t" + "mov %c[rsi](%[svm]), %%"R"si \n\t" + "mov %c[rdi](%[svm]), %%"R"di \n\t" + "mov %c[rbp](%[svm]), %%"R"bp \n\t" #ifdef CONFIG_X86_64 - "push %%rbp; \n\t" -#else - "push %%ebp; \n\t" -#endif - -#ifdef CONFIG_X86_64 - "mov %c[rbx](%[svm]), %%rbx \n\t" - "mov %c[rcx](%[svm]), %%rcx \n\t" - "mov %c[rdx](%[svm]), %%rdx \n\t" - "mov %c[rsi](%[svm]), %%rsi \n\t" - "mov %c[rdi](%[svm]), %%rdi \n\t" - "mov %c[rbp](%[svm]), %%rbp \n\t" "mov %c[r8](%[svm]), %%r8 \n\t" "mov %c[r9](%[svm]), %%r9 \n\t" "mov %c[r10](%[svm]), %%r10 \n\t" @@ -1756,41 +1757,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[r13](%[svm]), %%r13 \n\t" "mov %c[r14](%[svm]), %%r14 \n\t" "mov %c[r15](%[svm]), %%r15 \n\t" -#else - "mov %c[rbx](%[svm]), %%ebx \n\t" - "mov %c[rcx](%[svm]), %%ecx \n\t" - "mov %c[rdx](%[svm]), %%edx \n\t" - "mov %c[rsi](%[svm]), %%esi \n\t" - "mov %c[rdi](%[svm]), %%edi \n\t" - "mov %c[rbp](%[svm]), %%ebp \n\t" #endif -#ifdef CONFIG_X86_64 /* Enter guest mode */ - "push %%rax \n\t" - "mov %c[vmcb](%[svm]), %%rax \n\t" + "push %%"R"ax \n\t" + "mov %c[vmcb](%[svm]), %%"R"ax \n\t" __ex(SVM_VMLOAD) "\n\t" __ex(SVM_VMRUN) "\n\t" __ex(SVM_VMSAVE) "\n\t" - "pop %%rax \n\t" -#else - /* Enter guest mode */ - "push %%eax \n\t" - "mov %c[vmcb](%[svm]), %%eax \n\t" - __ex(SVM_VMLOAD) "\n\t" - __ex(SVM_VMRUN) "\n\t" - __ex(SVM_VMSAVE) "\n\t" - "pop %%eax \n\t" -#endif + "pop %%"R"ax \n\t" /* Save guest registers, load host registers */ + "mov %%"R"bx, %c[rbx](%[svm]) \n\t" + "mov %%"R"cx, %c[rcx](%[svm]) \n\t" + "mov %%"R"dx, %c[rdx](%[svm]) \n\t" + "mov %%"R"si, %c[rsi](%[svm]) \n\t" + "mov %%"R"di, %c[rdi](%[svm]) \n\t" + "mov %%"R"bp, %c[rbp](%[svm]) \n\t" #ifdef CONFIG_X86_64 - "mov %%rbx, %c[rbx](%[svm]) \n\t" - "mov %%rcx, %c[rcx](%[svm]) \n\t" - "mov %%rdx, %c[rdx](%[svm]) \n\t" - "mov %%rsi, %c[rsi](%[svm]) \n\t" - "mov %%rdi, %c[rdi](%[svm]) \n\t" - "mov %%rbp, %c[rbp](%[svm]) \n\t" "mov %%r8, %c[r8](%[svm]) \n\t" "mov %%r9, %c[r9](%[svm]) \n\t" "mov %%r10, %c[r10](%[svm]) \n\t" @@ -1799,18 +1783,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r13, %c[r13](%[svm]) \n\t" "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" - - "pop %%rbp; \n\t" -#else - "mov %%ebx, %c[rbx](%[svm]) \n\t" - "mov %%ecx, %c[rcx](%[svm]) \n\t" - "mov %%edx, %c[rdx](%[svm]) \n\t" - "mov %%esi, %c[rsi](%[svm]) \n\t" - "mov %%edi, %c[rdi](%[svm]) \n\t" - "mov %%ebp, %c[rbp](%[svm]) \n\t" - - "pop %%ebp; \n\t" #endif + "pop %%"R"bp" : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), @@ -1831,11 +1805,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) #endif : "cc", "memory" + , R"bx", R"cx", R"dx", R"si", R"di" #ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx" , "esi", "edi" #endif ); @@ -1867,6 +1839,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->next_rip = 0; } +#undef R + static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); -- cgit v1.2.3-70-g09d2 From 313dbd49dc239205b96da79fba09f7637cf84f3c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 17 Jul 2008 18:04:30 +0300 Subject: KVM: VMX: Avoid vmwrite(HOST_RSP) when possible Usually HOST_RSP retains its value across guest entries. Take advantage of this and avoid a vmwrite() when this is so. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fc8996b1043..ddb49e34697 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -58,6 +58,7 @@ struct vmcs { struct vcpu_vmx { struct kvm_vcpu vcpu; struct list_head local_vcpus_link; + unsigned long host_rsp; int launched; u8 fail; u32 idt_vectoring_info; @@ -2982,7 +2983,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Store host registers */ "push %%"R"dx; push %%"R"bp;" "push %%"R"cx \n\t" + "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%"R"sp, %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + "1: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ @@ -3039,6 +3044,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), -- cgit v1.2.3-70-g09d2 From b5e2fec0ebc3fcaff954092bb69444a67a904c0a Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 22 Jul 2008 08:00:45 +0200 Subject: KVM: Ignore DEBUGCTL MSRs with no effect Netware writes to DEBUGCTL and reads from the DEBUGCTL and LAST*IP MSRs without further checks and is really confused to receive a #GP during that. To make it happy we should just make them stubs, which is exactly what SVM already does. Writes to DEBUGCTL that are vendor-specific are resembled to behave as if the virtual CPU does not know them. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5620df2685d..94a216562f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -665,6 +665,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", __func__, data); break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* Values other than LBR and BTF are vendor-specific, + thus reserved and should throw a #GP */ + return 1; + } + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: break; @@ -757,6 +769,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+16: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: data = 0; break; case MSR_MTRRcap: -- cgit v1.2.3-70-g09d2 From 564f15378f04921d5749f27ec53d5e68a6d1d446 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 26 Jul 2008 17:00:59 -0300 Subject: KVM: Add irq ack notifier list This can be used by kvm subsystems that are interested in when interrupts are acked, for example time drift compensation. Signed-off-by: Avi Kivity --- arch/x86/kvm/irq.c | 22 ++++++++++++++++++++++ arch/x86/kvm/irq.h | 5 +++++ include/asm-x86/kvm_host.h | 7 +++++++ 3 files changed, 34 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 0d9e55275af..90911958d85 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -111,3 +111,25 @@ void kvm_set_irq(struct kvm *kvm, int irq, int level) kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level); kvm_pic_set_irq(pic_irqchip(kvm), irq, level); } + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) +{ + struct kvm_irq_ack_notifier *kian; + struct hlist_node *n; + + hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) + if (kian->gsi == gsi) + kian->irq_acked(kian); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_del(&kian->link); +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 07ff2aef0c1..95fe718e3ab 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,6 +83,11 @@ static inline int irqchip_in_kernel(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_set_irq(struct kvm *kvm, int irq, int level); +void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 83afa10c77f..d451928fc84 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -321,6 +321,12 @@ struct kvm_mem_alias { gfn_t target_gfn; }; +struct kvm_irq_ack_notifier { + struct hlist_node link; + unsigned gsi; + void (*irq_acked)(struct kvm_irq_ack_notifier *kian); +}; + struct kvm_arch{ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; @@ -336,6 +342,7 @@ struct kvm_arch{ struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; + struct hlist_head irq_ack_notifier_list; int round_robin_prev_vcpu; unsigned int tss_addr; -- cgit v1.2.3-70-g09d2 From f52447261bc8c21dfd4635196e32d2da1352f589 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sat, 26 Jul 2008 17:01:00 -0300 Subject: KVM: irq ack notification Based on a patch from: Ben-Ami Yassour which was based on a patch from: Amit Shah Notify IRQ acking on PIC/APIC emulation. The previous patch missed two things: - Edge triggered interrupts on IOAPIC - PIC reset with IRR/ISR set should be equivalent to ack (LAPIC probably needs something similar). Signed-off-by: Marcelo Tosatti CC: Amit Shah CC: Ben-Ami Yassour Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 12 +++++++++++- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/irq.h | 3 ++- arch/x86/kvm/lapic.c | 7 +++++-- virt/kvm/ioapic.c | 20 +++++++++++++------- virt/kvm/ioapic.h | 3 ++- 6 files changed, 34 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 55e179ad98e..de704995b81 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -159,9 +159,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) s->irr &= ~(1 << irq); } -int kvm_pic_read_irq(struct kvm_pic *s) +int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; + struct kvm_pic *s = pic_irqchip(kvm); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { @@ -187,12 +188,21 @@ int kvm_pic_read_irq(struct kvm_pic *s) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); + kvm_notify_acked_irq(kvm, irq); return intno; } void kvm_pic_reset(struct kvm_kpic_state *s) { + int irq; + struct kvm *kvm = s->pics_state->irq_request_opaque; + + for (irq = 0; irq < PIC_NUM_PINS; irq++) { + if (!(s->imr & (1 << irq)) && (s->irr & (1 << irq) || + s->isr & (1 << irq))) + kvm_notify_acked_irq(kvm, irq); + } s->last_irr = 0; s->irr = 0; s->imr = 0; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 90911958d85..3c508afaa28 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); s->output = 0; /* PIC */ - vector = kvm_pic_read_irq(s); + vector = kvm_pic_read_irq(v->kvm); } } return vector; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 95fe718e3ab..479a3d2d561 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -63,11 +63,12 @@ struct kvm_pic { void *irq_request_opaque; int output; /* intr from master PIC */ struct kvm_io_device dev; + void (*ack_notifier)(void *opaque, int irq); }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); void kvm_pic_set_irq(void *opaque, int irq, int level); -int kvm_pic_read_irq(struct kvm_pic *s); +int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9fde0ac2426..be94f93a73f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -439,7 +439,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - + int trigger_mode; /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC @@ -451,7 +451,10 @@ static void apic_set_eoi(struct kvm_lapic *apic) apic_update_ppr(apic); if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); } static void apic_send_ipi(struct kvm_lapic *apic) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index c0d22870ee9..515cd7ce761 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -39,6 +39,7 @@ #include "ioapic.h" #include "lapic.h" +#include "irq.h" #if 0 #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) @@ -285,26 +286,31 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) } } -static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi) +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi, + int trigger_mode) { union ioapic_redir_entry *ent; ent = &ioapic->redirtbl[gsi]; - ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); - ent->fields.remote_irr = 0; - if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) - ioapic_service(ioapic, gsi); + kvm_notify_acked_irq(ioapic->kvm, gsi); + + if (trigger_mode == IOAPIC_LEVEL_TRIG) { + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) + ioapic_service(ioapic, gsi); + } } -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; int i; for (i = 0; i < IOAPIC_NUM_PINS; i++) if (ioapic->redirtbl[i].fields.vector == vector) - __kvm_ioapic_update_eoi(ioapic, i); + __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); } static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 7f16675fe78..b52732f493c 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -58,6 +58,7 @@ struct kvm_ioapic { } redirtbl[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; + void (*ack_notifier)(void *opaque, int irq); }; #ifdef DEBUG @@ -87,7 +88,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm) struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, unsigned long bitmap); -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); -- cgit v1.2.3-70-g09d2 From 3cf57fed216e2c1b6fdfeccb792650bab72a350a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sat, 26 Jul 2008 17:01:01 -0300 Subject: KVM: PIT: fix injection logic and count The PIT injection logic is problematic under the following cases: 1) If there is a higher priority vector to be delivered by the time kvm_pit_timer_intr_post is invoked ps->inject_pending won't be set. This opens the possibility for missing many PIT event injections (say if guest executes hlt at this point). 2) ps->inject_pending is racy with more than two vcpus. Since there's no locking around read/dec of pt->pending, two vcpu's can inject two interrupts for a single pt->pending count. Fix 1 by using an irq ack notifier: only reinject when the previous irq has been acked. Fix 2 with appropriate locking around manipulation of pending count and irq_ack by the injection / ack paths. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 70 ++++++++++++++++++++++++++-------------------------- arch/x86/kvm/i8254.h | 7 +++--- arch/x86/kvm/irq.c | 1 - 3 files changed, 38 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c0f7872a912..7d04dd3ef85 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -207,6 +207,8 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); pt->scheduled = ktime_to_ns(pt->timer.expires); + if (pt->period) + ps->channels[0].count_load_time = pt->timer.expires; return (pt->period == 0 ? 0 : 1); } @@ -215,12 +217,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; - if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) + if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) return atomic_read(&pit->pit_state.pit_timer.pending); - return 0; } +void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, + irq_ack_notifier); + spin_lock(&ps->inject_lock); + if (atomic_dec_return(&ps->pit_timer.pending) < 0) + WARN_ON(1); + ps->irq_ack = 1; + spin_unlock(&ps->inject_lock); +} + static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { struct kvm_kpit_state *ps; @@ -255,8 +267,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt) hrtimer_cancel(&pt->timer); } -static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) +static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { + struct kvm_kpit_timer *pt = &ps->pit_timer; s64 interval; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -268,6 +281,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) pt->period = (is_period == 0) ? 0 : interval; pt->timer.function = pit_timer_fn; atomic_set(&pt->pending, 0); + ps->irq_ack = 1; hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); @@ -302,11 +316,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(&ps->pit_timer, val, 0); + create_pit_timer(ps, val, 0); break; case 2: case 3: - create_pit_timer(&ps->pit_timer, val, 1); + create_pit_timer(ps, val, 1); break; default: destroy_pit_timer(&ps->pit_timer); @@ -520,7 +534,7 @@ void kvm_pit_reset(struct kvm_pit *pit) mutex_unlock(&pit->pit_state.lock); atomic_set(&pit->pit_state.pit_timer.pending, 0); - pit->pit_state.inject_pending = 1; + pit->pit_state.irq_ack = 1; } struct kvm_pit *kvm_create_pit(struct kvm *kvm) @@ -534,6 +548,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); + spin_lock_init(&pit->pit_state.inject_lock); /* Initialize PIO device */ pit->dev.read = pit_ioport_read; @@ -555,6 +570,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit_state->pit = pit; hrtimer_init(&pit_state->pit_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pit_state->irq_ack_notifier.gsi = 0; + pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; + kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); @@ -592,37 +610,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) struct kvm_kpit_state *ps; if (vcpu && pit) { + int inject = 0; ps = &pit->pit_state; - /* Try to inject pending interrupts when: - * 1. Pending exists - * 2. Last interrupt was accepted or waited for too long time*/ - if (atomic_read(&ps->pit_timer.pending) && - (ps->inject_pending || - (jiffies - ps->last_injected_time - >= KVM_MAX_PIT_INTR_INTERVAL))) { - ps->inject_pending = 0; - __inject_pit_timer_intr(kvm); - ps->last_injected_time = jiffies; - } - } -} - -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - struct kvm_arch *arch = &vcpu->kvm->arch; - struct kvm_kpit_state *ps; - - if (vcpu && arch->vpit) { - ps = &arch->vpit->pit_state; - if (atomic_read(&ps->pit_timer.pending) && - (((arch->vpic->pics[0].imr & 1) == 0 && - arch->vpic->pics[0].irq_base == vec) || - (arch->vioapic->redirtbl[0].fields.vector == vec && - arch->vioapic->redirtbl[0].fields.mask != 1))) { - ps->inject_pending = 1; - atomic_dec(&ps->pit_timer.pending); - ps->channels[0].count_load_time = ktime_get(); + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; } + spin_unlock(&ps->inject_lock); + if (inject) + __inject_pit_timer_intr(kvm); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index db25c2a6c8c..e436d4983aa 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -8,7 +8,6 @@ struct kvm_kpit_timer { int irq; s64 period; /* unit: ns */ s64 scheduled; - ktime_t last_update; atomic_t pending; }; @@ -34,8 +33,9 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - bool inject_pending; /* if inject pending interrupts */ - unsigned long last_injected_time; + spinlock_t inject_lock; + unsigned long irq_ack; + struct kvm_irq_ack_notifier irq_ack_notifier; }; struct kvm_pit { @@ -54,7 +54,6 @@ struct kvm_pit { #define KVM_PIT_CHANNEL_MASK 0x3 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); struct kvm_pit *kvm_create_pit(struct kvm *kvm); void kvm_free_pit(struct kvm *kvm); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 3c508afaa28..8c1b9c5def7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) { kvm_apic_timer_intr_post(vcpu, vec); - kvm_pit_timer_intr_post(vcpu, vec); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_timer_intr_post); -- cgit v1.2.3-70-g09d2 From 3807f345b2c610336c17c7624a0d496a38df75a0 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 28 Jul 2008 11:47:52 -0300 Subject: x86: paravirt: factor out cpu_khz to common code KVM intends to use paravirt code to calibrate khz. Xen current code will do just fine. So as a first step, factor out code to pvclock.c. Signed-off-by: Glauber Costa Signed-off-by: Avi Kivity --- arch/x86/kernel/pvclock.c | 12 ++++++++++++ arch/x86/xen/time.c | 11 ++--------- include/asm-x86/pvclock.h | 1 + 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 05fbe9a0325..1c54b5fb7ae 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, return dst->version; } +unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) +{ + u64 tsc_khz = 1000000ULL << 32; + + do_div(tsc_khz, src->tsc_to_system_mul); + if (src->tsc_shift < 0) + tsc_khz <<= -src->tsc_shift; + else + tsc_khz >>= src->tsc_shift; + return tsc_khz; +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 004ba86326a..c9f7cda48ed 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void) /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { - u64 xen_khz = 1000000ULL << 32; - const struct pvclock_vcpu_time_info *info = + struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; - do_div(xen_khz, info->tsc_to_system_mul); - if (info->tsc_shift < 0) - xen_khz <<= -info->tsc_shift; - else - xen_khz >>= info->tsc_shift; - - return xen_khz; + return pvclock_tsc_khz(info); } cycle_t xen_clocksource_read(void) diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h index 1a38f683480..ad29e277fd6 100644 --- a/include/asm-x86/pvclock.h +++ b/include/asm-x86/pvclock.h @@ -6,6 +6,7 @@ /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec *ts); -- cgit v1.2.3-70-g09d2 From 0293615f3fb9886b6b23800c121be293bb7483e9 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 28 Jul 2008 11:47:53 -0300 Subject: x86: KVM guest: use paravirt function to calculate cpu khz We're currently facing timing problems in guests that do calibration under heavy load, and then the load vanishes. This means we'll have a much lower lpj than we actually should, and delays end up taking less time than they should, which is a nasty bug. Solution is to pass on the lpj value from host to guest, and have it preset. Signed-off-by: Glauber Costa Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d02def06ca9..774ac499156 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void) return ret; } +/* + * If we don't do that, there is the possibility that the guest + * will calibrate under heavy load - thus, getting a lower lpj - + * and execute the delays themselves without load. This is wrong, + * because no delay loop can finish beforehand. + * Any heuristics is subject to fail, because ultimately, a large + * poll of guests can be running and trouble each other. So we preset + * lpj here + */ +static unsigned long kvm_get_tsc_khz(void) +{ + return preset_lpj; +} + +static void kvm_get_preset_lpj(void) +{ + struct pvclock_vcpu_time_info *src; + unsigned long khz; + u64 lpj; + + src = &per_cpu(hv_clock, 0); + khz = pvclock_tsc_khz(src); + + lpj = ((u64)khz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; +} + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, @@ -153,6 +181,7 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; + pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; #endif @@ -163,6 +192,7 @@ void __init kvmclock_init(void) #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; #endif + kvm_get_preset_lpj(); clocksource_register(&kvm_clock); } } -- cgit v1.2.3-70-g09d2 From 4d5c5d0fe89c921336b95f5e7e4f529a9df92f53 Mon Sep 17 00:00:00 2001 From: Ben-Ami Yassour Date: Mon, 28 Jul 2008 19:26:26 +0300 Subject: KVM: pci device assignment Based on a patch from: Amit Shah This patch adds support for handling PCI devices that are assigned to the guest. The device to be assigned to the guest is registered in the host kernel and interrupt delivery is handled. If a device is already assigned, or the device driver for it is still loaded on the host, the device assignment is failed by conveying a -EBUSY reply to the userspace. Devices that share their interrupt line are not supported at the moment. By itself, this patch will not make devices work within the guest. The VT-d extension is required to enable the device to perform DMA. Another alternative is PVDMA. Signed-off-by: Amit Shah Signed-off-by: Ben-Ami Yassour Signed-off-by: Weidong Han Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 243 +++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/kvm_host.h | 16 +++ include/linux/kvm.h | 19 ++++ 3 files changed, 278 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94a216562f1..a97157cc42a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4,10 +4,14 @@ * derived from drivers/kvm/kvm_main.c * * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 * * Authors: * Avi Kivity * Yaniv Kamay + * Amit Shah + * Ben-Ami Yassour * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -23,8 +27,10 @@ #include "x86.h" #include +#include #include #include +#include #include #include #include @@ -98,6 +104,219 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +{ + struct kvm_assigned_dev_kernel *assigned_dev; + + assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, + interrupt_work); + + /* This is taken to safely inject irq inside the guest. When + * the interrupt injection (or the ioapic code) uses a + * finer-grained lock, update this + */ + mutex_lock(&assigned_dev->kvm->lock); + kvm_set_irq(assigned_dev->kvm, + assigned_dev->guest_irq, 1); + mutex_unlock(&assigned_dev->kvm->lock); + kvm_put_kvm(assigned_dev->kvm); +} + +/* FIXME: Implement the OR logic needed to make shared interrupts on + * this line behave properly + */ +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = + (struct kvm_assigned_dev_kernel *) dev_id; + + kvm_get_kvm(assigned_dev->kvm); + schedule_work(&assigned_dev->interrupt_work); + disable_irq_nosync(irq); + return IRQ_HANDLED; +} + +/* Ack the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev; + + if (kian->gsi == -1) + return; + + dev = container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + kvm_set_irq(dev->kvm, dev->guest_irq, 0); + enable_irq(dev->host_irq); +} + +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + + if (match->irq_requested) { + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + mutex_unlock(&kvm->lock); + return 0; + } + + INIT_WORK(&match->interrupt_work, + kvm_assigned_dev_interrupt_work_handler); + + if (irqchip_in_kernel(kvm)) { + if (assigned_irq->host_irq) + match->host_irq = assigned_irq->host_irq; + else + match->host_irq = match->dev->irq; + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + kvm_register_irq_ack_notifier(kvm, &match->ack_notifier); + + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. + */ + if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, + "kvm_assigned_device", (void *)match)) { + printk(KERN_INFO "%s: couldn't allocate irq for pv " + "device\n", __func__); + r = -EIO; + goto out; + } + } + + match->irq_requested = true; +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EINVAL; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_bus_and_slot(assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->dev = dev; + + match->kvm = kvm; + + list_add(&match->list, &kvm->arch.assigned_dev_head); + +out: + mutex_unlock(&kvm->lock); + return r; +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + mutex_unlock(&kvm->lock); + return r; +} + +static void kvm_free_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) { + free_irq(assigned_dev->host_irq, + (void *)assigned_dev); + + kvm_unregister_irq_ack_notifier(kvm, + &assigned_dev-> + ack_notifier); + } + + if (cancel_work_sync(&assigned_dev->interrupt_work)) + /* We had pending work. That means we will have to take + * care of kvm_put_kvm. + */ + kvm_put_kvm(kvm); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); + } +} unsigned long segment_base(u16 selector) { @@ -1766,6 +1985,28 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } case KVM_GET_PIT: { struct kvm_pit_state ps; r = -EFAULT; @@ -3945,6 +4186,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); return kvm; } @@ -3977,6 +4219,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_free_assigned_devices(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index d451928fc84..99dddfcecf6 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -327,6 +327,21 @@ struct kvm_irq_ack_notifier { void (*irq_acked)(struct kvm_irq_ack_notifier *kian); }; +struct kvm_assigned_dev_kernel { + struct kvm_irq_ack_notifier ack_notifier; + struct work_struct interrupt_work; + struct list_head list; + struct kvm_assigned_pci_dev assigned_dev; + int assigned_dev_id; + int host_busnr; + int host_devfn; + int host_irq; + int guest_irq; + int irq_requested; + struct pci_dev *dev; + struct kvm *kvm; +}; + struct kvm_arch{ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; @@ -339,6 +354,7 @@ struct kvm_arch{ * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; + struct list_head assigned_dev_head; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index d29b6488144..ef4bc6f8977 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -383,6 +383,7 @@ struct kvm_trace_rec { #define KVM_CAP_MP_STATE 14 #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ +#define KVM_CAP_DEVICE_ASSIGNMENT 17 /* * ioctls for VM fds @@ -412,6 +413,10 @@ struct kvm_trace_rec { _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) #define KVM_UNREGISTER_COALESCED_MMIO \ _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) +#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ + struct kvm_assigned_pci_dev) +#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \ + struct kvm_assigned_irq) /* * ioctls for vcpu fds @@ -476,4 +481,18 @@ struct kvm_trace_rec { #define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18) #define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19) +struct kvm_assigned_pci_dev { + __u32 assigned_dev_id; + __u32 busnr; + __u32 devfn; + __u32 flags; +}; + +struct kvm_assigned_irq { + __u32 assigned_dev_id; + __u32 host_irq; + __u32 guest_irq; + __u32 flags; +}; + #endif -- cgit v1.2.3-70-g09d2 From f0d662759a2465babdba1160749c446648c9d159 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 11 Aug 2008 10:01:45 -0700 Subject: KVM: Reduce kvm stack usage in kvm_arch_vm_ioctl() On my machine with gcc 3.4, kvm uses ~2k of stack in a few select functions. This is mostly because gcc fails to notice that the different case: statements could have their stack usage combined. It overflows very nicely if interrupts happen during one of these large uses. This patch uses two methods for reducing stack usage. 1. dynamically allocate large objects instead of putting on the stack. 2. Use a union{} member for all of the case variables. This tricks gcc into combining them all into a single stack allocation. (There's also a comment on this) Signed-off-by: Dave Hansen Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 72 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a97157cc42a..87d434228fe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1869,6 +1869,15 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -EINVAL; + /* + * This union makes it completely explicit to gcc-3.x + * that these two variables' stack usage should be + * combined, not added together. + */ + union { + struct kvm_pit_state ps; + struct kvm_memory_alias alias; + } u; switch (ioctl) { case KVM_SET_TSS_ADDR: @@ -1900,17 +1909,14 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_SET_MEMORY_ALIAS: { - struct kvm_memory_alias alias; - + case KVM_SET_MEMORY_ALIAS: r = -EFAULT; - if (copy_from_user(&alias, argp, sizeof alias)) + if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); if (r) goto out; break; - } case KVM_CREATE_IRQCHIP: r = -ENOMEM; kvm->arch.vpic = kvm_create_pic(kvm); @@ -1952,37 +1958,51 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) + r = -ENOMEM; + if (!chip) goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto get_irqchip_out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_get_irqchip(kvm, &chip); + goto get_irqchip_out; + r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) - goto out; + goto get_irqchip_out; r = -EFAULT; - if (copy_to_user(argp, &chip, sizeof chip)) - goto out; + if (copy_to_user(argp, chip, sizeof *chip)) + goto get_irqchip_out; r = 0; + get_irqchip_out: + kfree(chip); + if (r) + goto out; break; } case KVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) + r = -ENOMEM; + if (!chip) goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto set_irqchip_out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_set_irqchip(kvm, &chip); + goto set_irqchip_out; + r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) - goto out; + goto set_irqchip_out; r = 0; + set_irqchip_out: + kfree(chip); + if (r) + goto out; break; } case KVM_ASSIGN_PCI_DEVICE: { @@ -2008,31 +2028,29 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_GET_PIT: { - struct kvm_pit_state ps; r = -EFAULT; - if (copy_from_user(&ps, argp, sizeof ps)) + if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; - r = kvm_vm_ioctl_get_pit(kvm, &ps); + r = kvm_vm_ioctl_get_pit(kvm, &u.ps); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &ps, sizeof ps)) + if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) goto out; r = 0; break; } case KVM_SET_PIT: { - struct kvm_pit_state ps; r = -EFAULT; - if (copy_from_user(&ps, argp, sizeof ps)) + if (copy_from_user(&u.ps, argp, sizeof u.ps)) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; - r = kvm_vm_ioctl_set_pit(kvm, &ps); + r = kvm_vm_ioctl_set_pit(kvm, &u.ps); if (r) goto out; r = 0; -- cgit v1.2.3-70-g09d2 From b772ff362ec6b821c8a5227a3355e263f917bfad Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 11 Aug 2008 10:01:47 -0700 Subject: KVM: Reduce stack usage in kvm_arch_vcpu_ioctl() [sheng: fix KVM_GET_LAPIC using wrong size] Signed-off-by: Dave Hansen Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 87d434228fe..f1b0223c408 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1542,28 +1542,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; + struct kvm_lapic_state *lapic = NULL; switch (ioctl) { case KVM_GET_LAPIC: { - struct kvm_lapic_state lapic; + lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); - memset(&lapic, 0, sizeof lapic); - r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + r = -ENOMEM; + if (!lapic) + goto out; + r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &lapic, sizeof lapic)) + if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; } case KVM_SET_LAPIC: { - struct kvm_lapic_state lapic; - + lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + r = -ENOMEM; + if (!lapic) + goto out; r = -EFAULT; - if (copy_from_user(&lapic, argp, sizeof lapic)) + if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); if (r) goto out; r = 0; @@ -1661,6 +1666,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: + if (lapic) + kfree(lapic); return r; } -- cgit v1.2.3-70-g09d2 From 6ad18fba05228fb1d47cdbc0339fe8b3fca1ca26 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 11 Aug 2008 10:01:49 -0700 Subject: KVM: Reduce stack usage in kvm_pv_mmu_op() We're in a hot path. We can't use kmalloc() because it might impact performance. So, we just stick the buffer that we need into the kvm_vcpu_arch structure. This is used very often, so it is not really a waste. We also have to move the buffer structure's definition to the arch-specific x86 kvm header. Signed-off-by: Dave Hansen Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 23 ++++++++--------------- include/asm-x86/kvm_host.h | 10 ++++++++++ 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c3afbfe6b0c..171bcea1be2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -135,13 +135,6 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char buf[512] __aligned(sizeof(long)); -}; - struct kvm_rmap_desc { u64 *shadow_ptes[RMAP_EXT]; struct kvm_rmap_desc *more; @@ -2292,18 +2285,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, gpa_t addr, unsigned long *ret) { int r; - struct kvm_pv_mmu_op_buffer buffer; + struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; - buffer.ptr = buffer.buf; - buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); - buffer.processed = 0; + buffer->ptr = buffer->buf; + buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); + buffer->processed = 0; - r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); + r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); if (r) goto out; - while (buffer.len) { - r = kvm_pv_mmu_op_one(vcpu, &buffer); + while (buffer->len) { + r = kvm_pv_mmu_op_one(vcpu, buffer); if (r < 0) goto out; if (r == 0) @@ -2312,7 +2305,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, r = 1; out: - *ret = buffer.processed; + *ret = buffer->processed; return r; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 99dddfcecf6..9cb4b4dae5c 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -201,6 +201,13 @@ struct kvm_mmu_page { }; }; +struct kvm_pv_mmu_op_buffer { + void *ptr; + unsigned len; + unsigned processed; + char buf[512] __aligned(sizeof(long)); +}; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). The kvm_mmu structure abstracts the details of the current mmu @@ -248,6 +255,9 @@ struct kvm_vcpu_arch { bool tpr_access_reporting; struct kvm_mmu mmu; + /* only needed in kvm_pv_mmu_op() path, but it's hot so + * put it here to avoid allocation */ + struct kvm_pv_mmu_op_buffer mmu_op_buffer; struct kvm_mmu_memory_cache mmu_pte_chain_cache; struct kvm_mmu_memory_cache mmu_rmap_desc_cache; -- cgit v1.2.3-70-g09d2 From 464d17c8b747deb77d1bf8c14cc4f28aab2a4952 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 13 Aug 2008 14:10:33 +0800 Subject: KVM: VMX: Clean up magic number 0x66 in init_rmode_tss Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ddb49e34697..229e2d06fa2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1732,7 +1732,8 @@ static int init_rmode_tss(struct kvm *kvm) if (r < 0) goto out; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); + r = kvm_write_guest_page(kvm, fn++, &data, + TSS_IOPB_BASE_OFFSET, sizeof(u16)); if (r < 0) goto out; r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); -- cgit v1.2.3-70-g09d2 From 29415c37f043d1d54dcf356601d738ff6633b72b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 1 Aug 2008 20:09:13 -0300 Subject: KVM: set debug registers after "schedulable" section The vcpu thread can be preempted after the guest_debug_pre() callback, resulting in invalid debug registers on the new vcpu. Move it inside the non-preemptable section. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f1b0223c408..4a033757a19 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3113,10 +3113,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) down_read(&vcpu->kvm->slots_lock); vapic_enter(vcpu); -preempted: - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - again: if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) @@ -3170,6 +3166,9 @@ again: goto out; } + if (vcpu->guest_debug.enabled) + kvm_x86_ops->guest_debug_pre(vcpu); + vcpu->guest_mode = 1; /* * Make sure that guest_mode assignment won't happen after @@ -3244,7 +3243,7 @@ out: if (r > 0) { kvm_resched(vcpu); down_read(&vcpu->kvm->slots_lock); - goto preempted; + goto again; } post_kvm_run_save(vcpu, kvm_run); -- cgit v1.2.3-70-g09d2 From ecfc79c700b02c5ad1ccae58718015caa84824be Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 14 Aug 2008 11:13:16 +0300 Subject: KVM: VMX: Use interrupt queue for !irqchip_in_kernel Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 229e2d06fa2..81db7d48ab8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2173,7 +2173,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); if (!vcpu->arch.irq_pending[word_index]) clear_bit(word_index, &vcpu->arch.irq_summary); - vmx_inject_irq(vcpu, irq); + kvm_queue_interrupt(vcpu, irq); } @@ -2187,13 +2187,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && - !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ + vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) kvm_do_inject_irq(vcpu); + if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); if (!vcpu->arch.interrupt_window_open && (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) -- cgit v1.2.3-70-g09d2 From 85428ac7c39ab5fff23b5d14ccb32941e9401285 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 14 Aug 2008 20:53:25 -0300 Subject: KVM: fix i8259 reset irq acking The irq ack during pic reset has three problems: - Ignores slave/master PIC, using gsi 0-8 for both. - Generates an ACK even if the APIC is in control. - Depends upon IMR being clear, which is broken if the irq was masked at the time it was generated. The last one causes the BIOS to hang after the first reboot of Windows installation, since PIT interrupts stop. [avi: fix check whether pic interrupts are seen by cpu] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index de704995b81..71e3eeeccae 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -195,13 +195,19 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq; + int irq, irqbase; struct kvm *kvm = s->pics_state->irq_request_opaque; + struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; - for (irq = 0; irq < PIC_NUM_PINS; irq++) { - if (!(s->imr & (1 << irq)) && (s->irr & (1 << irq) || - s->isr & (1 << irq))) - kvm_notify_acked_irq(kvm, irq); + if (s == &s->pics_state->pics[0]) + irqbase = 0; + else + irqbase = 8; + + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { + if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) + if (s->irr & (1 << irq) || s->isr & (1 << irq)) + kvm_notify_acked_irq(kvm, irq+irqbase); } s->last_irr = 0; s->irr = 0; -- cgit v1.2.3-70-g09d2 From dc7404cea34ef997dfe89ca94d16358e9d29c8d8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 17 Aug 2008 16:03:46 +0300 Subject: KVM: Handle spurious acks for PIT interrupts Spurious acks can be generated, for example if the PIC is being reset. Handle those acks gracefully rather than flooding the log with warnings. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 7d04dd3ef85..c842060c6c0 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -228,7 +228,7 @@ void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) irq_ack_notifier); spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) - WARN_ON(1); + atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; spin_unlock(&ps->inject_lock); } -- cgit v1.2.3-70-g09d2 From 6762b7299aa115e11815decd1fd982d015f09615 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Wed, 13 Aug 2008 16:22:37 +0300 Subject: KVM: Device assignment: Check for privileges before assigning irq Even though we don't share irqs at the moment, we should ensure regular user processes don't try to allocate system resources. We check for capability to access IO devices (CAP_SYS_RAWIO) before we request_irq on behalf of the guest. Noticed by Avi. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4a033757a19..fffdf4f69c5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -191,6 +191,11 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, kvm_assigned_dev_interrupt_work_handler); if (irqchip_in_kernel(kvm)) { + if (!capable(CAP_SYS_RAWIO)) { + return -EPERM; + goto out; + } + if (assigned_irq->host_irq) match->host_irq = assigned_irq->host_irq; else -- cgit v1.2.3-70-g09d2 From 648dfaa7df2d3692db4e63dcb18dccb275d9c5a7 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 17 Aug 2008 16:38:32 +0300 Subject: KVM: VMX: Add Guest State Validity Checks This patch adds functions to check whether guest state is VMX compliant. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81db7d48ab8..e889b768c75 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1721,6 +1721,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vmcs_writel(GUEST_GDTR_BASE, dt->base); } +static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + u32 ar; + + vmx_get_segment(vcpu, &var, seg); + ar = vmx_segment_access_rights(&var); + + if (var.base != (var.selector << 4)) + return false; + if (var.limit != 0xffff) + return false; + if (ar != 0xf3) + return false; + + return true; +} + +static bool code_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + unsigned int cs_rpl; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs_rpl = cs.selector & SELECTOR_RPL_MASK; + + if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + return false; + if (!cs.s) + return false; + if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { + if (cs.dpl > cs_rpl) + return false; + } else if (cs.type & AR_TYPE_CODE_MASK) { + if (cs.dpl != cs_rpl) + return false; + } + if (!cs.present) + return false; + + /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ + return true; +} + +static bool stack_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ss; + unsigned int ss_rpl; + + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + ss_rpl = ss.selector & SELECTOR_RPL_MASK; + + if ((ss.type != 3) || (ss.type != 7)) + return false; + if (!ss.s) + return false; + if (ss.dpl != ss_rpl) /* DPL != RPL */ + return false; + if (!ss.present) + return false; + + return true; +} + +static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + unsigned int rpl; + + vmx_get_segment(vcpu, &var, seg); + rpl = var.selector & SELECTOR_RPL_MASK; + + if (!var.s) + return false; + if (!var.present) + return false; + if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (var.dpl < rpl) /* DPL < RPL */ + return false; + } + + /* TODO: Add other members to kvm_segment_field to allow checking for other access + * rights flags + */ + return true; +} + +static bool tr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment tr; + + vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + + if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ + return false; + if (!tr.present) + return false; + + return true; +} + +static bool ldtr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ldtr; + + vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + + if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if (ldtr.type != 2) + return false; + if (!ldtr.present) + return false; + + return true; +} + +static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ss; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + + return ((cs.selector & SELECTOR_RPL_MASK) == + (ss.selector & SELECTOR_RPL_MASK)); +} + +/* + * Check if guest state is valid. Returns true if valid, false if + * not. + * We assume that registers are always usable + */ +static bool guest_state_valid(struct kvm_vcpu *vcpu) +{ + /* real mode guest state checks */ + if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + } else { + /* protected mode guest state checks */ + if (!cs_ss_rpl_check(vcpu)) + return false; + if (!code_segment_valid(vcpu)) + return false; + if (!stack_segment_valid(vcpu)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + if (!tr_valid(vcpu)) + return false; + if (!ldtr_valid(vcpu)) + return false; + } + /* TODO: + * - Add checks on RIP + * - Add checks on RFLAGS + */ + + return true; +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; -- cgit v1.2.3-70-g09d2 From 04fa4d32117b1a7773290fd59a97cf90cfc2a0d4 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 17 Aug 2008 16:39:48 +0300 Subject: KVM: VMX: Add module parameter and emulation flag. The patch adds the module parameter required to enable emulating invalid guest state, as well as the emulation_required flag used to drive emulation whenever needed. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e889b768c75..7c5f611e1a9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -49,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0); static int enable_ept = 1; module_param(enable_ept, bool, 0); +static int emulate_invalid_guest_state = 0; +module_param(emulate_invalid_guest_state, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -86,6 +89,7 @@ struct vcpu_vmx { } irq; } rmode; int vpid; + bool emulation_required; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -- cgit v1.2.3-70-g09d2 From ea953ef0ca84e778187905177e2a789a1974837b Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 17 Aug 2008 16:47:05 +0300 Subject: KVM: VMX: Add invalid guest state handler This adds the invalid guest state handler function which invokes the x86 emulator until getting the guest to a VMX-friendly state. [avi: leave atomic context if scheduling] [guillaume: return to atomic context correctly] Signed-off-by: Laurent Vivier Signed-off-by: Guillaume Thouvenin Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7c5f611e1a9..eae1f2c64f9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2892,6 +2892,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int err; + + preempt_enable(); + local_irq_enable(); + + while (!guest_state_valid(vcpu)) { + err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + switch (err) { + case EMULATE_DONE: + break; + case EMULATE_DO_MMIO: + kvm_report_emulation_failure(vcpu, "mmio"); + /* TODO: Handle MMIO */ + return; + default: + kvm_report_emulation_failure(vcpu, "emulation failure"); + return; + } + + if (signal_pending(current)) + break; + if (need_resched()) + schedule(); + } + + local_irq_disable(); + preempt_disable(); + + /* Guest state should be valid now, no more emulation should be needed */ + vmx->emulation_required = 0; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs -- cgit v1.2.3-70-g09d2 From a89a8fb93ba63d4ad39db97c90997c409c87ccb9 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 17 Aug 2008 16:42:16 +0300 Subject: KVM: VMX: Modify mode switching and vmentry functions This patch modifies mode switching and vmentry function in order to drive invalid guest state emulation. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index eae1f2c64f9..9840f37925a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1298,7 +1298,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 0; vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); @@ -1315,6 +1317,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + return; + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); @@ -1355,7 +1360,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 1; vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); @@ -1377,6 +1384,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + goto continue_rmode; + vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); vmcs_write32(GUEST_SS_LIMIT, 0xffff); vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); @@ -1392,6 +1402,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); +continue_rmode: kvm_mmu_reset_context(vcpu); init_rmode(vcpu->kvm); } @@ -2317,6 +2328,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ret = 0; + /* HACK: Don't enable emulation on guest boot/reset */ + vmx->emulation_required = 0; + out: up_read(&vcpu->kvm->slots_lock); return ret; @@ -3190,6 +3204,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + /* Handle invalid guest state instead of entering VMX */ + if (vmx->emulation_required && emulate_invalid_guest_state) { + handle_invalid_guest_state(vcpu, kvm_run); + return; + } + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) -- cgit v1.2.3-70-g09d2 From 94c935a1ee7a9c134f0ebed656384186619d05cb Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Mon, 18 Aug 2008 13:11:46 +0300 Subject: KVM: SVM: Fix typo Fix typo in as-yet unused macro definition. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 179c2e00d0f..be86c096385 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -44,7 +44,7 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) -#define SVM_DEATURE_SVML (1 << 2) +#define SVM_FEATURE_SVML (1 << 2) #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) -- cgit v1.2.3-70-g09d2 From 29c8fa32c5d1e2d26d53ad9467b3a13130014cdf Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Mon, 18 Aug 2008 15:07:05 +0300 Subject: KVM: Use kvm_set_irq to inject interrupts ... instead of using the pic and ioapic variants Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 6 ++---- arch/x86/kvm/x86.c | 8 +------- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c842060c6c0..fdaa0f00e47 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -596,10 +596,8 @@ void kvm_free_pit(struct kvm *kvm) static void __inject_pit_timer_intr(struct kvm *kvm) { mutex_lock(&kvm->lock); - kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); - kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); - kvm_pic_set_irq(pic_irqchip(kvm), 0, 1); - kvm_pic_set_irq(pic_irqchip(kvm), 0, 0); + kvm_set_irq(kvm, 0, 1); + kvm_set_irq(kvm, 0, 0); mutex_unlock(&kvm->lock); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fffdf4f69c5..5b3c8821b19 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1956,13 +1956,7 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; if (irqchip_in_kernel(kvm)) { mutex_lock(&kvm->lock); - if (irq_event.irq < 16) - kvm_pic_set_irq(pic_irqchip(kvm), - irq_event.irq, - irq_event.level); - kvm_ioapic_set_irq(kvm->arch.vioapic, - irq_event.irq, - irq_event.level); + kvm_set_irq(kvm, irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); r = 0; } -- cgit v1.2.3-70-g09d2 From ee032c993edd34e0bdf64dab06a55d0e08a4eeb9 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 11 Aug 2008 16:54:20 -0700 Subject: KVM: make irq ack notifier functions static sparse says: arch/x86/kvm/x86.c:107:32: warning: symbol 'kvm_find_assigned_dev' was not declared. Should it be static? arch/x86/kvm/i8254.c:225:6: warning: symbol 'kvm_pit_ack_irq' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index fdaa0f00e47..4cb443026ec 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -222,7 +222,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) +static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5b3c8821b19..22edd95712e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -104,7 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) { struct list_head *ptr; -- cgit v1.2.3-70-g09d2 From 5706be0dafd6f42852f85fbae292301dcad4ccec Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Aug 2008 15:07:31 +0300 Subject: KVM: VMX: Change cs reset state to be a data segment Real mode cs is a data segment, not a code segment. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9840f37925a..6aa305ace79 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2239,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) fx_init(&vmx->vcpu); + seg_setup(VCPU_SREG_CS); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. @@ -2250,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); } - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); -- cgit v1.2.3-70-g09d2 From a16b20da879430fdf245ed45461ed40ffef8db3c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Aug 2008 15:48:27 +0300 Subject: KVM: VMX: Change segment dpl at reset to 3 This is more emulation friendly, if not 100% correct. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6aa305ace79..71e57ae1cab 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1991,7 +1991,7 @@ static void seg_setup(int seg) vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0x93); + vmcs_write32(sf->ar_bytes, 0xf3); } static int alloc_apic_access_page(struct kvm *kvm) -- cgit v1.2.3-70-g09d2 From f4bbd9aaaae23007e4d79536d35a30cbbb11d407 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Aug 2008 15:51:42 +0300 Subject: KVM: Load real mode segments correctly Real mode segments to not reference the GDT or LDT; they simply compute base = selector * 16. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 22edd95712e..bfc7c332c5d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3588,11 +3588,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, return 0; } +int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return 0; +} + int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { struct kvm_segment kvm_seg; + if (!(vcpu->arch.cr0 & X86_CR0_PE)) + return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; kvm_seg.type |= type_bits; -- cgit v1.2.3-70-g09d2 From 41afa025878bc31c9c4e18415fba2435fe035376 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Mon, 18 Aug 2008 21:25:01 -0400 Subject: KVM: x86 emulator: remove duplicate SrcImm Signed-off-by: Roel Kluin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index d5da7f14d53..5d6c1444b61 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -269,7 +269,7 @@ static u16 group_table[] = { ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group3*8] = - DstMem | SrcImm | ModRM | SrcImm, 0, + DstMem | SrcImm | ModRM, 0, DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = -- cgit v1.2.3-70-g09d2 From 6eb06cb2863a2ff5704b501f1699216180e790b5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Aug 2008 17:41:39 +0300 Subject: KVM: x86 emulator: remove bad ByteOp specifier from NEG descriptor Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 5d6c1444b61..ae30435ad33 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -270,7 +270,7 @@ static u16 group_table[] = { 0, 0, 0, 0, [Group3*8] = DstMem | SrcImm | ModRM, 0, - DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, -- cgit v1.2.3-70-g09d2 From 135f8c2b078533cc74e75f696e73d47304a61125 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Aug 2008 17:49:56 +0300 Subject: KVM: MMU: Move SHADOW_PT_INDEX to mmu.c It is not specific to the paging mode, so can be made global (and reusable). Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 ++ arch/x86/kvm/paging_tmpl.h | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 171bcea1be2..51d4cd7ae4f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -135,6 +135,8 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + struct kvm_rmap_desc { u64 *shadow_ptes[RMAP_EXT]; struct kvm_rmap_desc *more; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4a814bff21f..ebb26a09d31 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -29,7 +29,6 @@ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS #ifdef CONFIG_X86_64 @@ -46,7 +45,6 @@ #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 @@ -504,7 +502,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -#undef SHADOW_PT_INDEX #undef PT_LEVEL_MASK #undef PT_DIR_BASE_ADDR_MASK #undef PT_LEVEL_BITS -- cgit v1.2.3-70-g09d2 From 6e37d3dc3e358dbf907f8b96a51282966934124b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 22 Aug 2008 19:14:17 +0300 Subject: KVM: MMU: Unify direct map 4K and large page paths The two paths are equivalent except for one argument, which is already available. Merge the two codepaths. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 51d4cd7ae4f..3ee856f6812 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1240,15 +1240,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, ASSERT(VALID_PAGE(table_addr)); table = __va(table_addr); - if (level == 1) { + if (level == 1 || (largepage && level == 2)) { mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 0, gfn, pfn, false); - return pt_write; - } - - if (largepage && level == 2) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 1, gfn, pfn, false); + 0, write, 1, &pt_write, largepage, + gfn, pfn, false); return pt_write; } -- cgit v1.2.3-70-g09d2 From 6c41f428b72afe5a581b967590c12538db31d399 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 26 Aug 2008 16:16:08 +0300 Subject: KVM: MMU: Infer shadow root level in direct_map() In all cases the shadow root level is available in mmu.shadow_root_level, so there is no need to pass it as a parameter. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3ee856f6812..72f739aa862 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1227,11 +1227,11 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn, - int level) + int largepage, gfn_t gfn, pfn_t pfn) { hpa_t table_addr = vcpu->arch.mmu.root_hpa; int pt_write = 0; + int level = vcpu->arch.mmu.shadow_root_level; for (; ; level--) { u32 index = PT64_INDEX(v, level); @@ -1299,8 +1299,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, largepage, gfn, pfn, - PT32E_ROOT_LEVEL); + r = __direct_map(vcpu, v, write, largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -1455,7 +1454,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); + largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; -- cgit v1.2.3-70-g09d2 From 3d000db5688c8beff6319fb9ff4b98dcac32f798 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 22 Aug 2008 19:24:38 +0300 Subject: KVM: MMU: Add generic shadow walker We currently walk the shadow page tables in two places: direct map (for real mode and two dimensional paging) and paging mode shadow. Since we anticipate requiring a third walk (for invlpg), it makes sense to have a generic facility for shadow walk. This patch adds such a shadow walker, walks the page tables and calls a method for every spte encountered. The method can examine the spte, modify it, or even instantiate it. The walk can be aborted by returning nonzero from the method. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 72f739aa862..8b95cf748b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -142,6 +142,11 @@ struct kvm_rmap_desc { struct kvm_rmap_desc *more; }; +struct kvm_shadow_walk { + int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, + gva_t addr, u64 *spte, int level); +}; + static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; @@ -935,6 +940,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } +static int walk_shadow(struct kvm_shadow_walk *walker, + struct kvm_vcpu *vcpu, gva_t addr) +{ + hpa_t shadow_addr; + int level; + int r; + u64 *sptep; + unsigned index; + + shadow_addr = vcpu->arch.mmu.root_hpa; + level = vcpu->arch.mmu.shadow_root_level; + if (level == PT32E_ROOT_LEVEL) { + shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + shadow_addr &= PT64_BASE_ADDR_MASK; + --level; + } + + while (level >= PT_PAGE_TABLE_LEVEL) { + index = SHADOW_PT_INDEX(addr, level); + sptep = ((u64 *)__va(shadow_addr)) + index; + r = walker->entry(walker, vcpu, addr, sptep, level); + if (r) + return r; + shadow_addr = *sptep & PT64_BASE_ADDR_MASK; + --level; + } + return 0; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { -- cgit v1.2.3-70-g09d2 From 140754bc80e1cdbf2d14cdb10d900da1f7718e7b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 22 Aug 2008 19:28:04 +0300 Subject: KVM: MMU: Convert direct maps to use the generic shadow walker Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 93 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8b95cf748b5..a1ca4ff9c11 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1260,49 +1260,66 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) -{ - hpa_t table_addr = vcpu->arch.mmu.root_hpa; - int pt_write = 0; - int level = vcpu->arch.mmu.shadow_root_level; - - for (; ; level--) { - u32 index = PT64_INDEX(v, level); - u64 *table; +struct direct_shadow_walk { + struct kvm_shadow_walk walker; + pfn_t pfn; + int write; + int largepage; + int pt_write; +}; - ASSERT(VALID_PAGE(table_addr)); - table = __va(table_addr); +static int direct_map_entry(struct kvm_shadow_walk *_walk, + struct kvm_vcpu *vcpu, + gva_t addr, u64 *sptep, int level) +{ + struct direct_shadow_walk *walk = + container_of(_walk, struct direct_shadow_walk, walker); + struct kvm_mmu_page *sp; + gfn_t pseudo_gfn; + gfn_t gfn = addr >> PAGE_SHIFT; + + if (level == PT_PAGE_TABLE_LEVEL + || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, + 0, walk->write, 1, &walk->pt_write, + walk->largepage, gfn, walk->pfn, false); + return 1; + } - if (level == 1 || (largepage && level == 2)) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, largepage, - gfn, pfn, false); - return pt_write; + if (*sptep == shadow_trap_nonpresent_pte) { + pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, addr, level - 1, + 1, ACC_ALL, sptep); + if (!sp) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_pfn_clean(walk->pfn); + return -ENOMEM; } - if (table[index] == shadow_trap_nonpresent_pte) { - struct kvm_mmu_page *new_table; - gfn_t pseudo_gfn; - - pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, - v, level - 1, - 1, ACC_ALL, &table[index]); - if (!new_table) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(pfn); - return -ENOMEM; - } - - set_shadow_pte(&table[index], - __pa(new_table->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); - } - table_addr = table[index] & PT64_BASE_ADDR_MASK; + set_shadow_pte(sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); } + return 0; +} + +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn) +{ + int r; + struct direct_shadow_walk walker = { + .walker = { .entry = direct_map_entry, }, + .pfn = pfn, + .largepage = largepage, + .write = write, + .pt_write = 0, + }; + + r = walk_shadow(&walker.walker, vcpu, (gva_t)gfn << PAGE_SHIFT); + if (r < 0) + return r; + return walker.pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) -- cgit v1.2.3-70-g09d2 From abb9e0b8e33e58ac8e04e03b680c46435bc312fd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 22 Aug 2008 19:11:39 +0300 Subject: KVM: MMU: Convert the paging mode shadow walk to use the generic walker Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 158 ++++++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 72 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ebb26a09d31..b7064e1e1e1 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -25,6 +25,7 @@ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 + #define shadow_walker shadow_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK @@ -41,6 +42,7 @@ #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 + #define shadow_walker shadow_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK @@ -71,6 +73,17 @@ struct guest_walker { u32 error_code; }; +struct shadow_walker { + struct kvm_shadow_walk walker; + struct guest_walker *guest_walker; + int user_fault; + int write_fault; + int largepage; + int *ptwrite; + pfn_t pfn; + u64 *sptep; +}; + static gfn_t gpte_to_gfn(pt_element_t gpte) { return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -272,86 +285,86 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, - int user_fault, int write_fault, int largepage, - int *ptwrite, pfn_t pfn) +static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, + struct kvm_vcpu *vcpu, gva_t addr, + u64 *sptep, int level) { - hpa_t shadow_addr; - int level; - u64 *shadow_ent; - unsigned access = walker->pt_access; - - if (!is_present_pte(walker->ptes[walker->level - 1])) - return NULL; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; + struct shadow_walker *sw = + container_of(_sw, struct shadow_walker, walker); + struct guest_walker *gw = sw->guest_walker; + unsigned access = gw->pt_access; + struct kvm_mmu_page *shadow_page; + u64 spte; + int metaphysical; + gfn_t table_gfn; + int r; + pt_element_t curr_pte; + + if (level == PT_PAGE_TABLE_LEVEL + || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, + sw->user_fault, sw->write_fault, + gw->ptes[gw->level-1] & PT_DIRTY_MASK, + sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, + false); + sw->sptep = sptep; + return 1; } - for (; ; level--) { - u32 index = SHADOW_PT_INDEX(addr, level); - struct kvm_mmu_page *shadow_page; - u64 shadow_pte; - int metaphysical; - gfn_t table_gfn; - - shadow_ent = ((u64 *)__va(shadow_addr)) + index; - if (level == PT_PAGE_TABLE_LEVEL) - break; - - if (largepage && level == PT_DIRECTORY_LEVEL) - break; - - if (is_shadow_present_pte(*shadow_ent) - && !is_large_pte(*shadow_ent)) { - shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; - continue; - } + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return 0; - if (is_large_pte(*shadow_ent)) - rmap_remove(vcpu->kvm, shadow_ent); - - if (level - 1 == PT_PAGE_TABLE_LEVEL - && walker->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(walker->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(walker->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = walker->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, - shadow_ent); - if (!metaphysical) { - int r; - pt_element_t curr_pte; - r = kvm_read_guest_atomic(vcpu->kvm, - walker->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) { - kvm_release_pfn_clean(pfn); - return NULL; - } + if (is_large_pte(*sptep)) + rmap_remove(vcpu->kvm, sptep); + + if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + if (!is_dirty_pte(gw->ptes[level - 1])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + } else { + metaphysical = 0; + table_gfn = gw->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + metaphysical, access, sptep); + if (!metaphysical) { + r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != gw->ptes[level - 2]) { + kvm_release_pfn_clean(sw->pfn); + sw->sptep = NULL; + return 1; } - shadow_addr = __pa(shadow_page->spt); - shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - set_shadow_pte(shadow_ent, shadow_pte); } - mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, - user_fault, write_fault, - walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, walker->gfn, pfn, false); + spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; + return 0; +} + +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *guest_walker, + int user_fault, int write_fault, int largepage, + int *ptwrite, pfn_t pfn) +{ + struct shadow_walker walker = { + .walker = { .entry = FNAME(shadow_walk_entry), }, + .guest_walker = guest_walker, + .user_fault = user_fault, + .write_fault = write_fault, + .largepage = largepage, + .ptwrite = ptwrite, + .pfn = pfn, + }; + + if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) + return NULL; + + walk_shadow(&walker.walker, vcpu, addr); - return shadow_ent; + return walker.sptep; } /* @@ -499,6 +512,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, #undef pt_element_t #undef guest_walker +#undef shadow_walker #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -- cgit v1.2.3-70-g09d2 From acee3c04e8208c17aad1baff99baa68d71640a19 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 26 Aug 2008 17:22:47 +0300 Subject: KVM: Allocate guest memory as MAP_PRIVATE, not MAP_SHARED There is no reason to share internal memory slots with fork()ed instances. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bfc7c332c5d..675d010995a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4296,7 +4296,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, + MAP_PRIVATE | MAP_ANONYMOUS, 0); up_write(¤t->mm->mmap_sem); -- cgit v1.2.3-70-g09d2 From a5e2e82b8b62acd24a44b851e6bb4fd0793ead01 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 27 Aug 2008 05:02:56 +0300 Subject: KVM: x86 emulator: Add mov r, imm instructions (opcodes 0xb0-0xbf) The emulator only supported one instance of mov r, imm instruction (opcode 0xb8), this adds the rest of these instructions. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ae30435ad33..66e0bd6c628 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -154,9 +154,16 @@ static u16 opcode_table[256] = { 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xB0 - 0xBF */ - 0, 0, 0, 0, 0, 0, 0, 0, - DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + /* 0xB8 - 0xBF */ + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, /* 0xC0 - 0xC7 */ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, ImplicitOps | Stack, 0, 0, @@ -1660,7 +1667,7 @@ special_insn: case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; - case 0xb8: /* mov r, imm */ + case 0xb0 ... 0xbf: /* mov r, imm */ goto mov; case 0xc0 ... 0xc1: emulate_grp2(ctxt); -- cgit v1.2.3-70-g09d2 From bc2d429979451d69d0985c5dbdf908cace2831cc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Aug 2008 16:30:56 +0300 Subject: KVM: MMU: Account for npt/ept/realmode page faults Now that two-dimensional paging is becoming common, account for tdp page faults. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a1ca4ff9c11..a24da8f2ee9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1283,6 +1283,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk, mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, 0, walk->write, 1, &walk->pt_write, walk->largepage, gfn, walk->pfn, false); + ++vcpu->stat.pf_fixed; return 1; } -- cgit v1.2.3-70-g09d2 From 2245a28fe2e6fdb1bdabc4dcde1ea3a5c37e2a9e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Aug 2008 16:32:24 +0300 Subject: KVM: MMU: Add locking around kvm_mmu_slot_remove_write_access() It was generally safe due to slots_lock being held for write, but it wasn't very nice. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a24da8f2ee9..5052acdc0a7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2097,6 +2097,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; + spin_lock(&kvm->mmu_lock); list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; u64 *pt; @@ -2110,6 +2111,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (pt[i] & PT_WRITABLE_MASK) pt[i] &= ~PT_WRITABLE_MASK; } + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) -- cgit v1.2.3-70-g09d2 From 171d595d3b3254b9a952af8d1f6965d2e85dcbaa Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Aug 2008 16:40:51 +0300 Subject: KVM: MMU: Flush tlbs after clearing write permission when accessing dirty log Otherwise, the cpu may allow writes to the tracked pages, and we lose some display bits or fail to migrate correctly. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5052acdc0a7..853a2889b20 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2111,6 +2111,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (pt[i] & PT_WRITABLE_MASK) pt[i] &= ~PT_WRITABLE_MASK; } + kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); } -- cgit v1.2.3-70-g09d2 From 3201b5d9f0f7ef392886cd76dcd2c69186d9d5cd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Aug 2008 20:01:04 +0300 Subject: KVM: MMU: Fix setting the accessed bit on non-speculative sptes The accessed bit was accidentally turned on in a random flag word, rather than, the spte itself, which was lucky, since it used the non-EPT compatible PT_ACCESSED_MASK. Fix by turning the bit on in the spte and changing it to use the portable accessed mask. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 853a2889b20..866d7133cad 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1192,7 +1192,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, */ spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) - pte_access |= PT_ACCESSED_MASK; + spte |= shadow_accessed_mask; if (!dirty) pte_access &= ~ACC_WRITE_MASK; if (pte_access & ACC_EXEC_MASK) -- cgit v1.2.3-70-g09d2 From 48d150394999c542b2e828f03da226842950d46a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 28 Aug 2008 18:27:15 +0300 Subject: KVM: SVM: No need to unprotect memory during event injection when using npt No memory is protected anyway. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index be86c096385..60228888d1b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1021,7 +1021,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (npt_enabled) svm_flush_tlb(&svm->vcpu); - if (event_injection) + if (!npt_enabled && event_injection) kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } -- cgit v1.2.3-70-g09d2 From a89c1ad270ca7ad0eec2667bc754362ce7b142be Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 29 Aug 2008 11:52:07 +0200 Subject: KVM: add MC5_MISC msr read support Currently KVM implements MC0-MC4_MISC read support. When booting Linux this results in KVM warnings in the kernel log when the guest tries to read MC5_MISC. Fix this warnings with this patch. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 675d010995a..e3b89662cf6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -991,6 +991,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+8: case MSR_IA32_MC0_MISC+12: case MSR_IA32_MC0_MISC+16: + case MSR_IA32_MC0_MISC+20: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: -- cgit v1.2.3-70-g09d2 From fb4616f43148c5b3f3e453a47657572d1bda39ee Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 1 Sep 2008 04:52:24 +0300 Subject: KVM: x86 emulator: Add std and cld instructions (opcodes 0xfc-0xfd) This adds the std and cld instructions to the emulator. Encountered while running the BIOS with invalid guest state emulation enabled. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 66e0bd6c628..944f1f4d4be 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -187,7 +187,7 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, Group | Group4, Group | Group5, + ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, }; static u16 twobyte_table[256] = { @@ -1762,6 +1762,14 @@ special_insn: ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ break; + case 0xfc: /* cld */ + ctxt->eflags &= ~EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfd: /* std */ + ctxt->eflags |= EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; case 0xfe ... 0xff: /* Grp4/Grp5 */ rc = emulate_grp45(ctxt, ops); if (rc != 0) -- cgit v1.2.3-70-g09d2 From d40a1ee4859c673677c9811ae84475c4051baca5 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 1 Sep 2008 19:41:20 +0800 Subject: KVM: MMU: Modify kvm_shadow_walk.entry to accept u64 addr EPT is 4 level by default in 32pae(48 bits), but the addr parameter of kvm_shadow_walk->entry() only accept unsigned long as virtual address, which is 32bit in 32pae. This result in SHADOW_PT_INDEX() overflow when try to fetch level 4 index. Fix it by extend kvm_shadow_walk->entry() to accept 64bit addr in parameter. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++++----- arch/x86/kvm/paging_tmpl.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 866d7133cad..bce3e25ec79 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -144,7 +144,7 @@ struct kvm_rmap_desc { struct kvm_shadow_walk { int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, - gva_t addr, u64 *spte, int level); + u64 addr, u64 *spte, int level); }; static struct kmem_cache *pte_chain_cache; @@ -941,7 +941,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } static int walk_shadow(struct kvm_shadow_walk *walker, - struct kvm_vcpu *vcpu, gva_t addr) + struct kvm_vcpu *vcpu, u64 addr) { hpa_t shadow_addr; int level; @@ -1270,7 +1270,7 @@ struct direct_shadow_walk { static int direct_map_entry(struct kvm_shadow_walk *_walk, struct kvm_vcpu *vcpu, - gva_t addr, u64 *sptep, int level) + u64 addr, u64 *sptep, int level) { struct direct_shadow_walk *walk = container_of(_walk, struct direct_shadow_walk, walker); @@ -1289,7 +1289,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk, if (*sptep == shadow_trap_nonpresent_pte) { pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, addr, level - 1, + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, 1, ACC_ALL, sptep); if (!sp) { pgprintk("nonpaging_map: ENOMEM\n"); @@ -1317,7 +1317,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, .pt_write = 0, }; - r = walk_shadow(&walker.walker, vcpu, (gva_t)gfn << PAGE_SHIFT); + r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); if (r < 0) return r; return walker.pt_write; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b7064e1e1e1..b671f61be41 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -286,7 +286,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, * Fetch a shadow pte for a specific level in the paging hierarchy. */ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, gva_t addr, + struct kvm_vcpu *vcpu, u64 addr, u64 *sptep, int level) { struct shadow_walker *sw = @@ -326,7 +326,7 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, metaphysical = 0; table_gfn = gw->table_gfn[level - 2]; } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, metaphysical, access, sptep); if (!metaphysical) { r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], -- cgit v1.2.3-70-g09d2 From fa89a81766e33343fa8f0fe0e371819bf94a17a1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 1 Sep 2008 15:57:51 +0300 Subject: KVM: Add statistics for guest irq injections These can help show whether a guest is making progress or not. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 1 + arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + include/asm-x86/kvm_host.h | 1 + 4 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 60228888d1b..9b54550fa4d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1519,6 +1519,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); + ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 71e57ae1cab..e7e8c86f1b7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2341,6 +2341,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + ++vcpu->stat.irq_injections; if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3b89662cf6..3f3cb7107c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "fpu_reload", VCPU_STAT(fpu_reload) }, { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, + { "irq_injections", VCPU_STAT(irq_injections) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index b6d26b80b75..68a3ac13afc 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -413,6 +413,7 @@ struct kvm_vcpu_stat { u32 insn_emulation; u32 insn_emulation_fail; u32 hypercalls; + u32 irq_injections; }; struct descriptor_table { -- cgit v1.2.3-70-g09d2 From a6a3034cb979b1fa3948d8e1e91b2387fc66b89b Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sat, 6 Sep 2008 17:22:29 +0300 Subject: KVM: x86 emulator: Add in/out instructions (opcodes 0xe4-0xe7, 0xec-0xef) The patch adds in/out instructions to the x86 emulator. The instruction was encountered while running the BIOS while using the invalid guest state emulation patch. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 944f1f4d4be..3ac2f148522 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -177,11 +177,14 @@ static u16 opcode_table[256] = { /* 0xD8 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xE8 - 0xEF */ ImplicitOps | Stack, SrcImm | ImplicitOps, ImplicitOps, SrcImmByte | ImplicitOps, - 0, 0, 0, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, @@ -1259,6 +1262,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) u64 msr_data; unsigned long saved_eip = 0; struct decode_cache *c = &ctxt->decode; + unsigned int port; + int io_dir_in; int rc = 0; /* Shadow copy of register state. Committed on successful emulation. @@ -1687,6 +1692,16 @@ special_insn: c->src.val = c->regs[VCPU_REGS_RCX]; emulate_grp2(ctxt); break; + case 0xe4: /* inb */ + case 0xe5: /* in */ + port = insn_fetch(u8, 1, c->eip); + io_dir_in = 1; + goto do_io; + case 0xe6: /* outb */ + case 0xe7: /* out */ + port = insn_fetch(u8, 1, c->eip); + io_dir_in = 0; + goto do_io; case 0xe8: /* call (near) */ { long int rel; switch (c->op_bytes) { @@ -1737,6 +1752,22 @@ special_insn: jmp_rel(c, c->src.val); c->dst.type = OP_NONE; /* Disable writeback. */ break; + case 0xec: /* in al,dx */ + case 0xed: /* in (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 1; + goto do_io; + case 0xee: /* out al,dx */ + case 0xef: /* out (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 0; + do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, + (c->d & ByteOp) ? 1 : c->op_bytes, + port) != 0) { + c->eip = saved_eip; + goto cannot_emulate; + } + return 0; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; break; -- cgit v1.2.3-70-g09d2 From d76901750ab9f71091d33ef3d2b5909d8a9a4ad4 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 8 Sep 2008 15:23:48 -0300 Subject: KVM: x86: do not execute halted vcpus Offline or uninitialized vcpu's can be executed if requested to perform userspace work. Follow Avi's suggestion to handle halted vcpu's in the main loop, simplifying kvm_emulate_halt(). Introduce a new vcpu->requests bit to indicate events that promote state from halted to running. Also standardize vcpu wake sites. Signed-off-by: Marcelo Tosatti redhat.com> Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 5 +-- arch/x86/kvm/lapic.c | 16 ++------ arch/x86/kvm/x86.c | 100 ++++++++++++++++++++++++++--------------------- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 10 ++--- 5 files changed, 67 insertions(+), 65 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4cb443026ec..634132a9a51 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,10 +200,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) if (!atomic_inc_and_test(&pt->pending)) set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); - if (vcpu0 && waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (vcpu0 && waitqueue_active(&vcpu0->wq)) wake_up_interruptible(&vcpu0->wq); - } pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); pt->scheduled = ktime_to_ns(pt->timer.expires); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index be94f93a73f..fd00f698692 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -339,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } else apic_clear_vector(vector, apic->regs + APIC_TMR); - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - kvm_vcpu_kick(vcpu); - else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) { - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); - } + kvm_vcpu_kick(vcpu); result = (orig_irr == 0); break; @@ -384,8 +378,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); + kvm_vcpu_kick(vcpu); } break; @@ -950,10 +943,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic) if(!atomic_inc_and_test(&apic->timer.pending)) set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); - if (waitqueue_active(q)) { - apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + if (waitqueue_active(q)) wake_up_interruptible(q); - } + if (apic_lvtt_period(apic)) { result = 1; apic->timer.dev.expires = ktime_add_ns( diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3f3cb7107c0..bf98d40b21e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2798,11 +2798,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; - up_read(&vcpu->kvm->slots_lock); - kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); - if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) - return -EINTR; return 1; } else { vcpu->run->exit_reason = KVM_EXIT_HLT; @@ -3097,24 +3092,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) up_read(&vcpu->kvm->slots_lock); } -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - } - - down_read(&vcpu->kvm->slots_lock); - vapic_enter(vcpu); - -again: if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) kvm_mmu_unload(vcpu); @@ -3151,22 +3132,13 @@ again: local_irq_disable(); - if (vcpu->requests || need_resched()) { + if (vcpu->requests || need_resched() || signal_pending(current)) { local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (signal_pending(current)) { - local_irq_enable(); - preempt_enable(); - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } - if (vcpu->guest_debug.enabled) kvm_x86_ops->guest_debug_pre(vcpu); @@ -3227,26 +3199,63 @@ again: kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(kvm_run, vcpu); +out: + return r; +} - if (r > 0) { - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - goto out; - } - if (!need_resched()) - goto again; +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { + printk("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_x86_ops->vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -out: - up_read(&vcpu->kvm->slots_lock); - if (r > 0) { - kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); - goto again; + down_read(&vcpu->kvm->slots_lock); + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (kvm_arch_vcpu_runnable(vcpu)) + r = vcpu_enter_guest(vcpu, kvm_run); + else { + up_read(&vcpu->kvm->slots_lock); + kvm_vcpu_block(vcpu); + down_read(&vcpu->kvm->slots_lock); + if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) + r = -EINTR; + } + + if (r > 0) { + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + up_read(&vcpu->kvm->slots_lock); + kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); + } + } } + up_read(&vcpu->kvm->slots_lock); post_kvm_run_save(vcpu, kvm_run); vapic_exit(vcpu); @@ -3266,6 +3275,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); r = -EAGAIN; goto out; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a18aaad2ab7..4b036430ea2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -34,6 +34,7 @@ #define KVM_REQ_MMU_RELOAD 3 #define KVM_REQ_TRIPLE_FAULT 4 #define KVM_REQ_PENDING_TIMER 5 +#define KVM_REQ_UNHALT 6 struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index de3b029f6ad..63e661be040 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -980,12 +980,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if (kvm_cpu_has_interrupt(vcpu)) - break; - if (kvm_cpu_has_pending_timer(vcpu)) - break; - if (kvm_arch_vcpu_runnable(vcpu)) + if (kvm_cpu_has_interrupt(vcpu) || + kvm_cpu_has_pending_timer(vcpu) || + kvm_arch_vcpu_runnable(vcpu)) { + set_bit(KVM_REQ_UNHALT, &vcpu->requests); break; + } if (signal_pending(current)) break; -- cgit v1.2.3-70-g09d2 From d19292e457a7c1b7f6c12bccbfdfd53630de1cee Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 8 Sep 2008 21:47:19 +0300 Subject: KVM: x86 emulator: Add call near absolute instruction (opcode 0xff/2) Add call near absolute instruction. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 3ac2f148522..0630d219876 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -286,7 +286,8 @@ static u16 group_table[] = { ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + SrcMem | ModRM | Stack, 0, SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem, ModRM | SrcMem, @@ -1162,6 +1163,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, case 1: /* dec */ emulate_1op("dec", c->dst, ctxt->eflags); break; + case 2: /* call near abs */ { + long int old_eip; + old_eip = c->eip; + c->eip = c->src.val; + c->src.val = old_eip; + emulate_push(ctxt); + break; + } case 4: /* jmp abs */ c->eip = c->src.val; break; -- cgit v1.2.3-70-g09d2 From 9c3e4aab5ae27bf8589d61c7874e213832b4b7d2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Sep 2008 16:40:55 -0300 Subject: KVM: x86: unhalt vcpu0 on reset Since "KVM: x86: do not execute halted vcpus", HLT by vcpu0 before system reset by the IO thread will hang the guest. Mark vcpu as runnable in such case. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf98d40b21e..2134f3e0a51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3959,6 +3959,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + /* Older userspace won't unhalt the vcpu on reset. */ + if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !(vcpu->arch.cr0 & X86_CR0_PE)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu_put(vcpu); return 0; -- cgit v1.2.3-70-g09d2 From 4b92fe0c9d0376eb105c88d49b77595e8e5b1548 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 11 Sep 2008 12:58:00 +0200 Subject: KVM: VMX: Cleanup stalled INTR_INFO read Commit 1c0f4f5011829dac96347b5f84ba37c2252e1e08 left a useless access of VM_ENTRY_INTR_INFO_FIELD in vmx_intr_assist behind. Clean this up. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e7e8c86f1b7..f8e615fc874 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3135,11 +3135,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) static void vmx_intr_assist(struct kvm_vcpu *vcpu) { - u32 intr_info_field; - update_tpr_threshold(vcpu); - intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); if (cpu_has_virtual_nmis()) { if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { if (vmx_nmi_enabled(vcpu)) { -- cgit v1.2.3-70-g09d2 From ef46f18ea010359f7536afd3f56a92c9c83ac09a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 11 Sep 2008 19:47:13 +0300 Subject: KVM: x86 emulator: fix jmp r/m64 instruction jmp r/m64 doesn't require the rex.w prefix to indicate the operand size is 64 bits. Set the Stack attribute (even though it doesn't involve the stack, really) to indicate this. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 0630d219876..0c120c4c9c0 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -288,7 +288,7 @@ static u16 group_table[] = { [Group5*8] = DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, SrcMem | ModRM | Stack, 0, - SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem, ModRM | SrcMem, SrcNone | ModRM | DstMem | Mov, 0, -- cgit v1.2.3-70-g09d2 From 9ea542facbd0fd12a53e169953a3fdd6d0364532 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 11 Sep 2008 15:27:49 +0800 Subject: KVM: VMX: Rename IA32_FEATURE_CONTROL bits Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 18 +++++++++--------- arch/x86/kvm/vmx.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f8e615fc874..046a91b5a4b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1041,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (IA32_FEATURE_CONTROL_LOCKED_BIT | - IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) - == IA32_FEATURE_CONTROL_LOCKED_BIT; + return (msr & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + == FEATURE_CONTROL_LOCKED; /* locked but not enabled */ } @@ -1055,14 +1055,14 @@ static void hardware_enable(void *garbage) INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (IA32_FEATURE_CONTROL_LOCKED_BIT | - IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) - != (IA32_FEATURE_CONTROL_LOCKED_BIT | - IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) + if ((old & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + != (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - IA32_FEATURE_CONTROL_LOCKED_BIT | - IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT); + FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED); write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 86059f439cb..44cfab70644 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -331,8 +331,8 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define IA32_FEATURE_CONTROL_LOCKED_BIT 0x1 -#define IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT 0x4 +#define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 -- cgit v1.2.3-70-g09d2 From defed7ed926eca9f178a632df05baa866abe754e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 11 Sep 2008 15:27:50 +0800 Subject: x86: Move FEATURE_CONTROL bits to msr-index.h For MSR_IA32_FEATURE_CONTROL is already there. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.h | 3 --- include/asm-x86/msr-index.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 44cfab70644..3e010d21fdd 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -331,9 +331,6 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define FEATURE_CONTROL_LOCKED (1<<0) -#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) - #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 diff --git a/include/asm-x86/msr-index.h b/include/asm-x86/msr-index.h index 0bb43301a20..dabd10f0bbe 100644 --- a/include/asm-x86/msr-index.h +++ b/include/asm-x86/msr-index.h @@ -178,6 +178,9 @@ #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_IA32_FEATURE_CONTROL 0x0000003a +#define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) + #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) #define MSR_IA32_APICBASE_ENABLE (1<<11) -- cgit v1.2.3-70-g09d2 From 9c9fddd0e784346a6d0ce82ed20d9ad21f3c8a2c Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Fri, 12 Sep 2008 13:50:25 +0200 Subject: KVM: x86 emulator: Add DstAcc operand type Add DstAcc operand type. That means that there are 4 bits now for DstMask. "In the good old days cpus would have only one register that was able to fully participate in arithmetic operations, typically called A for Accumulator. The x86 retains this tradition by having special, shorter encodings for the A register (like the cmp opcode), and even some instructions that only operate on A (like mul). SrcAcc and DstAcc would accommodate these instructions by decoding A into the corresponding 'struct operand'." -- Avi Kivity Signed-off-by: Guillaume Thouvenin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 50 +++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 0c120c4c9c0..4390ec8c47a 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -47,25 +47,26 @@ #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ #define DstReg (2<<1) /* Register operand. */ #define DstMem (3<<1) /* Memory operand. */ -#define DstMask (3<<1) +#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstMask (7<<1) /* Source operand type. */ -#define SrcNone (0<<3) /* No source operand. */ -#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ -#define SrcReg (1<<3) /* Register operand. */ -#define SrcMem (2<<3) /* Memory operand. */ -#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ -#define SrcImm (5<<3) /* Immediate operand. */ -#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ -#define SrcMask (7<<3) +#define SrcNone (0<<4) /* No source operand. */ +#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<4) /* Register operand. */ +#define SrcMem (2<<4) /* Memory operand. */ +#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ +#define SrcImm (5<<4) /* Immediate operand. */ +#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<4) /* Generic ModRM decode. */ -#define ModRM (1<<6) +#define ModRM (1<<7) /* Destination is only written; never read. */ -#define Mov (1<<7) -#define BitOp (1<<8) -#define MemAbs (1<<9) /* Memory operand is absolute displacement */ -#define String (1<<10) /* String instruction (rep capable) */ -#define Stack (1<<11) /* Stack instruction (push/pop) */ +#define Mov (1<<8) +#define BitOp (1<<9) +#define MemAbs (1<<10) /* Memory operand is absolute displacement */ +#define String (1<<12) /* String instruction (rep capable) */ +#define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ @@ -1060,6 +1061,23 @@ done_prefixes: } c->dst.type = OP_MEM; break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->op_bytes) { + case 1: + c->dst.val = *(u8 *)c->dst.ptr; + break; + case 2: + c->dst.val = *(u16 *)c->dst.ptr; + break; + case 4: + c->dst.val = *(u32 *)c->dst.ptr; + break; + } + c->dst.orig_val = c->dst.val; + break; } if (c->rip_relative) -- cgit v1.2.3-70-g09d2 From 8a9fee67fba585b4c4731a749367e1751ebf416c Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Fri, 12 Sep 2008 13:51:15 +0200 Subject: KVM: x86 emulator: Add cmp al, imm and cmp ax, imm instructions (ocodes 3c, 3d) Add decode entries for these opcodes; execution is already implemented. Signed-off-by: Guillaume Thouvenin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 4390ec8c47a..2b43208a38b 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -108,7 +108,8 @@ static u16 opcode_table[256] = { /* 0x38 - 0x3F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + 0, 0, /* 0x40 - 0x47 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x48 - 0x4F */ -- cgit v1.2.3-70-g09d2 From aa3a816b6d0bd59e1a9c548cc7d2dd829f26534f Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Fri, 12 Sep 2008 13:52:18 +0200 Subject: KVM: x86 emulator: Use DstAcc for 'and' For instruction 'and al,imm' we use DstAcc instead of doing the emulation directly into the instruction's opcode. Signed-off-by: Guillaume Thouvenin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 2b43208a38b..ea051173b0d 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -96,7 +96,7 @@ static u16 opcode_table[256] = { /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - SrcImmByte, SrcImm, 0, 0, + DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -1392,27 +1392,10 @@ special_insn: sbb: /* sbb */ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; - case 0x20 ... 0x23: + case 0x20 ... 0x25: and: /* and */ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); break; - case 0x24: /* and al imm8 */ - c->dst.type = OP_REG; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - c->dst.val = *(u8 *)c->dst.ptr; - c->dst.bytes = 1; - c->dst.orig_val = c->dst.val; - goto and; - case 0x25: /* and ax imm16, or eax imm32 */ - c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - if (c->op_bytes == 2) - c->dst.val = *(u16 *)c->dst.ptr; - else - c->dst.val = *(u32 *)c->dst.ptr; - c->dst.orig_val = c->dst.val; - goto and; case 0x28 ... 0x2d: sub: /* sub */ emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); -- cgit v1.2.3-70-g09d2 From 62c476c7c7f25a5b245b9902a935636e6316e58c Mon Sep 17 00:00:00 2001 From: Ben-Ami Yassour Date: Sun, 14 Sep 2008 03:48:28 +0300 Subject: KVM: Device Assignment with VT-d Based on a patch by: Kay, Allen M This patch enables PCI device assignment based on VT-d support. When a device is assigned to the guest, the guest memory is pinned and the mapping is updated in the VT-d IOMMU. [Amit: Expose KVM_CAP_IOMMU so we can check if an IOMMU is present and also control enable/disable from userspace] Signed-off-by: Kay, Allen M Signed-off-by: Weidong Han Signed-off-by: Ben-Ami Yassour Signed-off-by: Amit Shah Acked-by: Mark Gross Signed-off-by: Avi Kivity --- arch/x86/kvm/Makefile | 3 + arch/x86/kvm/vtd.c | 198 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 14 ++++ include/asm-x86/kvm_host.h | 23 +----- include/linux/kvm.h | 3 + include/linux/kvm_host.h | 52 ++++++++++++ virt/kvm/kvm_main.c | 9 ++- 7 files changed, 281 insertions(+), 21 deletions(-) create mode 100644 arch/x86/kvm/vtd.c (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d0e940bb6f4..3072b17447a 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,6 +12,9 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ i8254.o +ifeq ($(CONFIG_DMAR),y) +kvm-objs += vtd.o +endif obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c new file mode 100644 index 00000000000..667bf3fb64b --- /dev/null +++ b/arch/x86/kvm/vtd.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Copyright IBM Corporation, 2008 + * Author: Allen M. Kay + * Author: Weidong Han + * Author: Ben-Ami Yassour + */ + +#include +#include +#include +#include +#include + +static int kvm_iommu_unmap_memslots(struct kvm *kvm); +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages); + +int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + int i, r; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + r = -EINVAL; + for (i = 0; i < npages; i++) { + /* check if already mapped */ + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + if (pfn && !is_mmio_pfn(pfn)) + continue; + + pfn = gfn_to_pfn(kvm, gfn); + if (!is_mmio_pfn(pfn)) { + r = intel_iommu_page_mapping(domain, + gfn_to_gpa(gfn), + pfn_to_hpa(pfn), + PAGE_SIZE, + DMA_PTE_READ | + DMA_PTE_WRITE); + if (r) { + printk(KERN_DEBUG "kvm_iommu_map_pages:" + "iommu failed to map pfn=%lx\n", pfn); + goto unmap_pages; + } + } else { + printk(KERN_DEBUG "kvm_iommu_map_page:" + "invalid pfn=%lx\n", pfn); + goto unmap_pages; + } + gfn++; + } + return 0; + +unmap_pages: + kvm_iommu_put_pages(kvm, base_gfn, i); + return r; +} + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int i, r; + + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + if (r) + break; + } + up_read(&kvm->slots_lock); + return r; +} + +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + struct pci_dev *pdev = NULL; + int r; + + if (!intel_iommu_found()) { + printk(KERN_ERR "%s: intel iommu not found\n", __func__); + return -ENODEV; + } + + printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n", + assigned_dev->host_busnr, + PCI_SLOT(assigned_dev->host_devfn), + PCI_FUNC(assigned_dev->host_devfn)); + + pdev = assigned_dev->dev; + + if (pdev == NULL) { + if (kvm->arch.intel_iommu_domain) { + intel_iommu_domain_exit(kvm->arch.intel_iommu_domain); + kvm->arch.intel_iommu_domain = NULL; + } + return -ENODEV; + } + + kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev); + if (!kvm->arch.intel_iommu_domain) + return -ENODEV; + + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + + intel_iommu_detach_dev(kvm->arch.intel_iommu_domain, + pdev->bus->number, pdev->devfn); + + r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain, + pdev); + if (r) { + printk(KERN_ERR "Domain context map for %s failed", + pci_name(pdev)); + goto out_unmap; + } + return 0; + +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + int i; + + for (i = 0; i < npages; i++) { + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + kvm_release_pfn_clean(pfn); + gfn++; + } +} + +static int kvm_iommu_unmap_memslots(struct kvm *kvm) +{ + int i; + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + } + up_read(&kvm->slots_lock); + + return 0; +} + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct kvm_assigned_dev_kernel *entry; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) { + printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n", + entry->host_busnr, + PCI_SLOT(entry->host_devfn), + PCI_FUNC(entry->host_devfn)); + + /* detach kvm dmar domain */ + intel_iommu_detach_dev(domain, entry->host_busnr, + entry->host_devfn); + } + kvm_iommu_unmap_memslots(kvm); + intel_iommu_domain_exit(domain); + return 0; +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2134f3e0a51..c8a2793626e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -277,9 +278,18 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, list_add(&match->list, &kvm->arch.assigned_dev_head); + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { + r = kvm_iommu_map_guest(kvm, match); + if (r) + goto out_list_del; + } + out: mutex_unlock(&kvm->lock); return r; +out_list_del: + list_del(&match->list); + pci_release_regions(dev); out_disable: pci_disable_device(dev); out_put: @@ -1147,6 +1157,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PV_MMU: r = !tdp_enabled; break; + case KVM_CAP_IOMMU: + r = intel_iommu_found(); + break; default: r = 0; break; @@ -4282,6 +4295,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_iommu_unmap_guest(kvm); kvm_free_assigned_devices(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 68a3ac13afc..805629c0f15 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -331,26 +331,6 @@ struct kvm_mem_alias { gfn_t target_gfn; }; -struct kvm_irq_ack_notifier { - struct hlist_node link; - unsigned gsi; - void (*irq_acked)(struct kvm_irq_ack_notifier *kian); -}; - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct work_struct interrupt_work; - struct list_head list; - int assigned_dev_id; - int host_busnr; - int host_devfn; - int host_irq; - int guest_irq; - int irq_requested; - struct pci_dev *dev; - struct kvm *kvm; -}; - struct kvm_arch{ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; @@ -364,6 +344,7 @@ struct kvm_arch{ */ struct list_head active_mmu_pages; struct list_head assigned_dev_head; + struct dmar_domain *intel_iommu_domain; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; @@ -514,6 +495,8 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, gpa_t addr, unsigned long *ret); +int is_mmio_pfn(pfn_t pfn); + extern bool tdp_enabled; enum emulation_result { diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ef4bc6f8977..4269be171fa 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -384,6 +384,7 @@ struct kvm_trace_rec { #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ #define KVM_CAP_DEVICE_ASSIGNMENT 17 +#define KVM_CAP_IOMMU 18 /* * ioctls for VM fds @@ -495,4 +496,6 @@ struct kvm_assigned_irq { __u32 flags; }; +#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4b036430ea2..6252802c3cc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -286,6 +286,53 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); +struct kvm_irq_ack_notifier { + struct hlist_node link; + unsigned gsi; + void (*irq_acked)(struct kvm_irq_ack_notifier *kian); +}; + +struct kvm_assigned_dev_kernel { + struct kvm_irq_ack_notifier ack_notifier; + struct work_struct interrupt_work; + struct list_head list; + int assigned_dev_id; + int host_busnr; + int host_devfn; + int host_irq; + int guest_irq; + int irq_requested; + struct pci_dev *dev; + struct kvm *kvm; +}; + +#ifdef CONFIG_DMAR +int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, + unsigned long npages); +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev); +int kvm_iommu_unmap_guest(struct kvm *kvm); +#else /* CONFIG_DMAR */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, + unsigned long npages) +{ + return 0; +} + +static inline int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + return -ENODEV; +} + +static inline int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + return 0; +} +#endif /* CONFIG_DMAR */ + static inline void kvm_guest_enter(void) { account_system_vtime(current); @@ -308,6 +355,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn) return (gpa_t)gfn << PAGE_SHIFT; } +static inline hpa_t pfn_to_hpa(pfn_t pfn) +{ + return (hpa_t)pfn << PAGE_SHIFT; +} + static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) { set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 63e661be040..f42d5c2a396 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -76,7 +77,7 @@ static inline int valid_vcpu(int n) return likely(n >= 0 && n < KVM_MAX_VCPUS); } -static inline int is_mmio_pfn(pfn_t pfn) +inline int is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) return PageReserved(pfn_to_page(pfn)); @@ -578,6 +579,12 @@ int __kvm_set_memory_region(struct kvm *kvm, } kvm_free_physmem_slot(&old, &new); + + /* map the pages in iommu page table */ + r = kvm_iommu_map_pages(kvm, base_gfn, npages); + if (r) + goto out; + return 0; out_free: -- cgit v1.2.3-70-g09d2 From bfadaded0dc323a1cf3f08b5068f12955b54cbaa Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Tue, 16 Sep 2008 18:04:28 +0300 Subject: KVM: Device Assignment: Free device structures if IRQ allocation fails When an IRQ allocation fails, we free up the device structures and disable the device so that we can unregister the device in the userspace and not expose it to the guest at all. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 86 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8a2793626e..61eddbeabeb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -166,6 +166,43 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) enable_irq(dev->host_irq); } +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) + free_irq(assigned_dev->host_irq, (void *)assigned_dev); + + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); + + if (cancel_work_sync(&assigned_dev->interrupt_work)) + /* We had pending work. That means we will have to take + * care of kvm_put_kvm. + */ + kvm_put_kvm(kvm); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +static void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, struct kvm_assigned_irq *assigned_irq) @@ -194,8 +231,8 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, if (irqchip_in_kernel(kvm)) { if (!capable(CAP_SYS_RAWIO)) { - return -EPERM; - goto out; + r = -EPERM; + goto out_release; } if (assigned_irq->host_irq) @@ -214,17 +251,18 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, */ if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, "kvm_assigned_device", (void *)match)) { - printk(KERN_INFO "%s: couldn't allocate irq for pv " - "device\n", __func__); r = -EIO; - goto out; + goto out_release; } } match->irq_requested = true; -out: mutex_unlock(&kvm->lock); return r; +out_release: + mutex_unlock(&kvm->lock); + kvm_free_assigned_device(kvm, match); + return r; } static int kvm_vm_ioctl_assign_device(struct kvm *kvm, @@ -300,40 +338,6 @@ out_free: return r; } -static void kvm_free_assigned_devices(struct kvm *kvm) -{ - struct list_head *ptr, *ptr2; - struct kvm_assigned_dev_kernel *assigned_dev; - - list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { - assigned_dev = list_entry(ptr, - struct kvm_assigned_dev_kernel, - list); - - if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) { - free_irq(assigned_dev->host_irq, - (void *)assigned_dev); - - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev-> - ack_notifier); - } - - if (cancel_work_sync(&assigned_dev->interrupt_work)) - /* We had pending work. That means we will have to take - * care of kvm_put_kvm. - */ - kvm_put_kvm(kvm); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); - } -} - unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -4296,7 +4300,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); - kvm_free_assigned_devices(kvm); + kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); -- cgit v1.2.3-70-g09d2 From 4c2155ce81c193788082d4b8cdbc26d79edebc58 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 16 Sep 2008 20:54:47 -0300 Subject: KVM: switch to get_user_pages_fast Convert gfn_to_pfn to use get_user_pages_fast, which can do lockless pagetable lookups on x86. Kernel compilation on 4-way guest is 3.7% faster on VMX. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/powerpc/kvm/44x_tlb.c | 2 -- arch/x86/kvm/mmu.c | 23 +++++++++-------------- arch/x86/kvm/paging_tmpl.h | 8 +------- arch/x86/kvm/vmx.c | 4 ---- arch/x86/kvm/x86.c | 6 ------ virt/kvm/kvm_main.c | 10 +++++----- 6 files changed, 15 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 7b11fd7be54..2e227a412bc 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -147,9 +147,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, stlbe = &vcpu->arch.shadow_tlb[victim]; /* Get reference to new page. */ - down_read(¤t->mm->mmap_sem); new_page = gfn_to_page(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); if (is_error_page(new_page)) { printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); kvm_release_page_clean(new_page); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bce3e25ec79..5779a2323e2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -405,16 +405,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr; + int ret = 0; addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return 0; + return ret; + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); if (vma && is_vm_hugetlb_page(vma)) - return 1; + ret = 1; + up_read(¤t->mm->mmap_sem); - return 0; + return ret; } static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -1140,9 +1143,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) if (gpa == UNMAPPED_GVA) return NULL; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); return page; } @@ -1330,16 +1331,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn_t pfn; unsigned long mmu_seq; - down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { @@ -1488,15 +1487,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return 1; @@ -1809,15 +1806,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - down_read(¤t->mm->mmap_sem); if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); vcpu->arch.update_pte.largepage = 1; } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b671f61be41..6dd08e096e2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -102,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, pt_element_t *table; struct page *page; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(kvm, table_gfn); - up_read(¤t->mm->mmap_sem); table = kmap_atomic(page, KM_USER0); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table, KM_USER0); kvm_release_page_dirty(page); @@ -418,7 +414,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - down_read(¤t->mm->mmap_sem); if (walker.level == PT_DIRECTORY_LEVEL) { gfn_t large_gfn; large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); @@ -428,9 +423,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, } } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); - up_read(¤t->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 046a91b5a4b..025bf4011ab 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2010,9 +2010,7 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - down_read(¤t->mm->mmap_sem); kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); - up_read(¤t->mm->mmap_sem); out: up_write(&kvm->slots_lock); return r; @@ -2034,10 +2032,8 @@ static int alloc_identity_pagetable(struct kvm *kvm) if (r) goto out; - down_read(¤t->mm->mmap_sem); kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); out: up_write(&kvm->slots_lock); return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 61eddbeabeb..108f07267e8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -946,10 +946,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - down_read(¤t->mm->mmap_sem); vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); if (is_error_page(vcpu->arch.time_page)) { kvm_release_page_clean(vcpu->arch.time_page); @@ -2322,9 +2320,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, val = *(u64 *)new; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); kaddr = kmap_atomic(page, KM_USER0); set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); @@ -3089,9 +3085,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); vcpu->arch.apic->vapic_page = page; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2907d05cfcc..cd34f73513d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -723,9 +723,6 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_hva); -/* - * Requires current->mm->mmap_sem to be held - */ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { struct page *page[1]; @@ -741,20 +738,23 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) return page_to_pfn(bad_page); } - npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, - NULL); + npages = get_user_pages_fast(addr, 1, 1, page); if (unlikely(npages != 1)) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); + if (vma == NULL || addr < vma->vm_start || !(vma->vm_flags & VM_PFNMAP)) { + up_read(¤t->mm->mmap_sem); get_page(bad_page); return page_to_pfn(bad_page); } pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + up_read(¤t->mm->mmap_sem); BUG_ON(!is_mmio_pfn(pfn)); } else pfn = page_to_pfn(page[0]); -- cgit v1.2.3-70-g09d2 From 2259e3a7a6089007839cd4bbf7c9867190c67238 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 22 Aug 2008 13:29:17 -0700 Subject: KVM: x86.c make kvm_load_realmode_segment static Noticed by sparse: arch/x86/kvm/x86.c:3591:5: warning: symbol 'kvm_load_realmode_segment' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 108f07267e8..1b738cb0283 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3611,7 +3611,7 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, return 0; } -int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) +static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { .base = selector << 4, -- cgit v1.2.3-70-g09d2 From af2152f5457448bd90cb019c108e0a85e716fdbe Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 22 Sep 2008 14:28:53 +0300 Subject: KVM: don't enter guest after SIPI was received by a CPU The vcpu should process pending SIPI message before entering guest mode again. kvm_arch_vcpu_runnable() returns true if the vcpu is in SIPI state, so we can't call it here. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b738cb0283..08edeabf15e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3233,7 +3233,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = 1; while (r > 0) { - if (kvm_arch_vcpu_runnable(vcpu)) + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu, kvm_run); else { up_read(&vcpu->kvm->slots_lock); -- cgit v1.2.3-70-g09d2 From a08546001c2b0f584ffc81987340943a7d6d6acb Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 23 Sep 2008 11:01:45 -0700 Subject: x86: pvclock: fix shadowed variable warning arch/x86/kernel/pvclock.c:102:6: warning: symbol 'tsc_khz' shadows an earlier one include/asm/tsc.h:18:21: originally declared here Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Avi Kivity --- arch/x86/kernel/pvclock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 1c54b5fb7ae..4f9c55f3a7c 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -99,14 +99,14 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) { - u64 tsc_khz = 1000000ULL << 32; + u64 pv_tsc_khz = 1000000ULL << 32; - do_div(tsc_khz, src->tsc_to_system_mul); + do_div(pv_tsc_khz, src->tsc_to_system_mul); if (src->tsc_shift < 0) - tsc_khz <<= -src->tsc_shift; + pv_tsc_khz <<= -src->tsc_shift; else - tsc_khz >>= src->tsc_shift; - return tsc_khz; + pv_tsc_khz >>= src->tsc_shift; + return pv_tsc_khz; } cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) -- cgit v1.2.3-70-g09d2 From 93a423e7045cf3cf69f960ff307edda1afcd7b41 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:29 -0300 Subject: KVM: MMU: flush remote TLBs on large->normal entry overwrite It is necessary to flush all TLB's when a large spte entry is overwritten with a normal page directory pointer. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6dd08e096e2..e9fbaa44d44 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -310,8 +310,11 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) return 0; - if (is_large_pte(*sptep)) + if (is_large_pte(*sptep)) { + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); rmap_remove(vcpu->kvm, sptep); + } if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { metaphysical = 1; -- cgit v1.2.3-70-g09d2 From 1e73f9dd885957bf0c7bb5e63b350d5aeb06b726 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:30 -0300 Subject: KVM: MMU: split mmu_set_spte Split the spte entry creation code into a new set_spte function. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 101 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5779a2323e2..9ad4cc55389 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1148,44 +1148,13 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) return page; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) +static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pte_access, int user_fault, + int write_fault, int dirty, int largepage, + gfn_t gfn, pfn_t pfn, bool speculative) { u64 spte; - int was_rmapped = 0; - int was_writeble = is_writeble_pte(*shadow_pte); - - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", - __func__, *shadow_pte, pt_access, - write_fault, user_fault, gfn); - - if (is_rmap_pte(*shadow_pte)) { - /* - * If we overwrite a PTE page pointer with a 2MB PMD, unlink - * the parent of the now unreachable PTE. - */ - if (largepage && !is_large_pte(*shadow_pte)) { - struct kvm_mmu_page *child; - u64 pte = *shadow_pte; - - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, shadow_pte); - } else if (pfn != spte_to_pfn(*shadow_pte)) { - pgprintk("hfn old %lx new %lx\n", - spte_to_pfn(*shadow_pte), pfn); - rmap_remove(vcpu->kvm, shadow_pte); - } else { - if (largepage) - was_rmapped = is_large_pte(*shadow_pte); - else - was_rmapped = 1; - } - } - + int ret = 0; /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect @@ -1218,26 +1187,70 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); + ret = 1; pte_access &= ~ACC_WRITE_MASK; if (is_writeble_pte(spte)) { spte &= ~PT_WRITABLE_MASK; kvm_x86_ops->tlb_flush(vcpu); } - if (write_fault) - *ptwrite = 1; } } if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); - pgprintk("%s: setting spte %llx\n", __func__, spte); - pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", - (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", - (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); set_shadow_pte(shadow_pte, spte); - if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) - && (spte & PT_PRESENT_MASK)) + return ret; +} + + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, int largepage, gfn_t gfn, + pfn_t pfn, bool speculative) +{ + int was_rmapped = 0; + int was_writeble = is_writeble_pte(*shadow_pte); + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %lx\n", + __func__, *shadow_pte, pt_access, + write_fault, user_fault, gfn); + + if (is_rmap_pte(*shadow_pte)) { + /* + * If we overwrite a PTE page pointer with a 2MB PMD, unlink + * the parent of the now unreachable PTE. + */ + if (largepage && !is_large_pte(*shadow_pte)) { + struct kvm_mmu_page *child; + u64 pte = *shadow_pte; + + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, shadow_pte); + } else if (pfn != spte_to_pfn(*shadow_pte)) { + pgprintk("hfn old %lx new %lx\n", + spte_to_pfn(*shadow_pte), pfn); + rmap_remove(vcpu->kvm, shadow_pte); + } else { + if (largepage) + was_rmapped = is_large_pte(*shadow_pte); + else + was_rmapped = 1; + } + } + if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, + dirty, largepage, gfn, pfn, speculative)) + if (write_fault) + *ptwrite = 1; + + pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); + pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + is_large_pte(*shadow_pte)? "2MB" : "4kB", + is_present_pte(*shadow_pte)?"RW":"R", gfn, + *shadow_pte, shadow_pte); + if (!was_rmapped && is_large_pte(*shadow_pte)) ++vcpu->kvm->stat.lpages; page_header_update_slot(vcpu->kvm, shadow_pte, gfn); -- cgit v1.2.3-70-g09d2 From a378b4e64c0fef2d9e53214db167878b7673a7a3 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:31 -0300 Subject: KVM: MMU: move local TLB flush to mmu_set_spte Since the sync page path can collapse flushes. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9ad4cc55389..23752ef0839 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1189,10 +1189,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) { + if (is_writeble_pte(spte)) spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } } } @@ -1241,9 +1239,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative)) + dirty, largepage, gfn, pfn, speculative)) { if (write_fault) *ptwrite = 1; + kvm_x86_ops->tlb_flush(vcpu); + } pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", -- cgit v1.2.3-70-g09d2 From 38187c830cab84daecb41169948467f1f19317e3 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:32 -0300 Subject: KVM: MMU: do not write-protect large mappings There is not much point in write protecting large mappings. This can only happen when a page is shadowed during the window between is_largepage_backed and mmu_lock acquision. Zap the entry instead, so the next pagefault will find a shadowed page via is_largepage_backed and fallback to 4k translations. Simplifies out of sync shadow. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 23752ef0839..731e6fe9cb0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1180,11 +1180,16 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, || (write_fault && !is_write_protection(vcpu) && !user_fault)) { struct kvm_mmu_page *shadow; + if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { + ret = 1; + spte = shadow_trap_nonpresent_pte; + goto set_pte; + } + spte |= PT_WRITABLE_MASK; shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow || - (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { + if (shadow) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); ret = 1; @@ -1197,6 +1202,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); +set_pte: set_shadow_pte(shadow_pte, spte); return ret; } -- cgit v1.2.3-70-g09d2 From e8bc217aef67d41d767ede6e7a7eb10f1d47c86c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:33 -0300 Subject: KVM: MMU: mode specific sync_page Examine guest pagetable and bring the shadow back in sync. Caller is responsible for local TLB flush before re-entering guest mode. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++++++++ arch/x86/kvm/paging_tmpl.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/kvm_host.h | 2 ++ 3 files changed, 66 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 731e6fe9cb0..90f01169c8f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -871,6 +871,12 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, sp->spt[i] = shadow_trap_nonpresent_pte; } +static int nonpaging_sync_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + return 1; +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -1547,6 +1553,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1594,6 +1601,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->prefetch_page = paging64_prefetch_page; + context->sync_page = paging64_sync_page; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -1615,6 +1623,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; context->prefetch_page = paging32_prefetch_page; + context->sync_page = paging32_sync_page; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1634,6 +1643,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = tdp_page_fault; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e9fbaa44d44..776fb6d2fd8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -507,6 +507,60 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, } } +/* + * Using the cached information from sp->gfns is safe because: + * - The spte has a reference to the struct page, so the pfn for a given gfn + * can't change unless all sptes pointing to it are nuked first. + * - Alias changes zap the entire shadow cache. + */ +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + int i, offset, nr_present; + + offset = nr_present = 0; + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn = sp->gfns[i]; + + if (!is_shadow_present_pte(sp->spt[i])) + continue; + + pte_gpa = gfn_to_gpa(sp->gfn); + pte_gpa += (i+offset) * sizeof(pt_element_t); + + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -EINVAL; + + if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || + !(gpte & PT_ACCESSED_MASK)) { + u64 nonpresent; + + rmap_remove(vcpu->kvm, &sp->spt[i]); + if (is_present_pte(gpte)) + nonpresent = shadow_trap_nonpresent_pte; + else + nonpresent = shadow_notrap_nonpresent_pte; + set_shadow_pte(&sp->spt[i], nonpresent); + continue; + } + + nr_present++; + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, + is_dirty_pte(gpte), 0, gfn, + spte_to_pfn(sp->spt[i]), true); + } + + return !nr_present; +} + #undef pt_element_t #undef guest_walker #undef shadow_walker diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 805629c0f15..8bad9bd9b37 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -220,6 +220,8 @@ struct kvm_mmu { gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); + int (*sync_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp); hpa_t root_hpa; int root_level; int shadow_root_level; -- cgit v1.2.3-70-g09d2 From 0ba73cdadb8ac172f396df7e23c4a9cebd59b550 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:34 -0300 Subject: KVM: MMU: sync roots on mmu reload Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + include/asm-x86/kvm_host.h | 1 + 3 files changed, 38 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 90f01169c8f..9d8c4bb68a8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1471,6 +1471,41 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } +static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ +} + +static void mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + } + } +} + +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -1715,6 +1750,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); mmu_alloc_roots(vcpu); + mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 08edeabf15e..88e6d9abbd2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -594,6 +594,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); return; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 8bad9bd9b37..475d8ab83bf 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -584,6 +584,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -- cgit v1.2.3-70-g09d2 From a7052897b3bcd568a9f5bfaa558957039e7e7ec0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:35 -0300 Subject: KVM: x86: trap invlpg With pages out of sync invlpg needs to be trapped. For now simply nuke the entry. Untested on AMD. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 ++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 25 +++++++++++++++++++++++++ arch/x86/kvm/svm.c | 13 +++++++++++-- arch/x86/kvm/vmx.c | 19 ++++++++++++++++--- arch/x86/kvm/x86.c | 1 + include/asm-x86/kvm_host.h | 2 ++ 6 files changed, 73 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9d8c4bb68a8..e89af1df4fc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -877,6 +877,10 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 1; } +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -1589,6 +1593,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1637,6 +1642,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->gva_to_gpa = paging64_gva_to_gpa; context->prefetch_page = paging64_prefetch_page; context->sync_page = paging64_sync_page; + context->invlpg = paging64_invlpg; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -1659,6 +1665,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->free = paging_free; context->prefetch_page = paging32_prefetch_page; context->sync_page = paging32_sync_page; + context->invlpg = paging32_invlpg; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1679,6 +1686,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; @@ -2071,6 +2079,16 @@ out: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + spin_lock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.invlpg(vcpu, gva); + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_flush_tlb(vcpu); + ++vcpu->stat.invlpg; +} +EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + void kvm_enable_tdp(void) { tdp_enabled = true; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 776fb6d2fd8..dc169e8148b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -461,6 +461,31 @@ out_unlock: return 0; } +static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, + struct kvm_vcpu *vcpu, u64 addr, + u64 *sptep, int level) +{ + + if (level == PT_PAGE_TABLE_LEVEL) { + if (is_shadow_present_pte(*sptep)) + rmap_remove(vcpu->kvm, sptep); + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + return 1; + } + if (!is_shadow_present_pte(*sptep)) + return 1; + return 0; +} + +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct shadow_walker walker = { + .walker = { .entry = FNAME(shadow_invlpg_entry), }, + }; + + walk_shadow(&walker.walker, vcpu, gva); +} + static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) { struct guest_walker walker; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9b54550fa4d..9c4ce657d96 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -525,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm) (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | + (1ULL << INTERCEPT_INVLPG) | (1ULL << INTERCEPT_INVLPGA) | (1ULL << INTERCEPT_IOIO_PROT) | (1ULL << INTERCEPT_MSR_PROT) | @@ -589,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl = 1; - control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); + control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | + (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| INTERCEPT_CR3_MASK); @@ -1164,6 +1166,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) + pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); + return 1; +} + static int emulate_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -1417,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, - [SVM_EXIT_INVLPG] = emulate_on_interception, + [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invalid_op_interception, [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 025bf4011ab..4556cc3715b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1130,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_CR3_STORE_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING; + CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -1159,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { - /* CR3 accesses don't need to cause VM Exits when EPT enabled */ + /* CR3 accesses and invlpg don't need to cause VM Exits when EPT + enabled */ min &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2790,6 +2793,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + + kvm_mmu_invlpg(vcpu, exit_qualification); + skip_emulated_instruction(vcpu); + return 1; +} + static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); @@ -2958,6 +2970,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_MSR_WRITE] = handle_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88e6d9abbd2..efee85ba07e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2341,6 +2341,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) { + kvm_mmu_invlpg(vcpu, address); return X86EMUL_CONTINUE; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 475d8ab83bf..8b935cc4c14 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -222,6 +222,7 @@ struct kvm_mmu { struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -591,6 +592,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_fix_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_enable_tdp(void); void kvm_disable_tdp(void); -- cgit v1.2.3-70-g09d2 From ad8cfbe3fffdc09704f0808fde3934855620d545 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:36 -0300 Subject: KVM: MMU: mmu_parent_walk Introduce a function to walk all parents of a given page, invoking a handler. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e89af1df4fc..b82abee78f1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,8 @@ struct kvm_shadow_walk { u64 addr, u64 *spte, int level); }; +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); + static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; @@ -862,6 +864,31 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } + +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_parent_walk_fn fn) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + struct kvm_mmu_page *parent_sp; + int i; + + if (!sp->multimapped && sp->parent_pte) { + parent_sp = page_header(__pa(sp->parent_pte)); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + } +} + static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { -- cgit v1.2.3-70-g09d2 From 0738541396be165995c7f2387746eb0b47024fec Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:37 -0300 Subject: KVM: MMU: awareness of new kvm_mmu_zap_page behaviour kvm_mmu_zap_page will soon zap the unsynced children of a page. Restart list walk in such case. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b82abee78f1..c9b4b902527 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1078,7 +1078,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } } -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ++kvm->stat.mmu_shadow_zapped; kvm_mmu_page_unlink_children(kvm, sp); @@ -1095,6 +1095,7 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_reload_remote_mmus(kvm); } kvm_mmu_reset_last_pte_updated(kvm); + return 0; } /* @@ -1147,8 +1148,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) if (sp->gfn == gfn && !sp->role.metaphysical) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + n = bucket->first; } return r; } @@ -1992,7 +1994,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_zap_page(vcpu->kvm, sp); + if (kvm_mmu_zap_page(vcpu->kvm, sp)) + n = bucket->first; ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -2226,7 +2229,9 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + node = container_of(kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); -- cgit v1.2.3-70-g09d2 From 6844dec6948679d084f054235fee19ba4e3a3096 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:38 -0300 Subject: KVM: MMU: mmu_convert_notrap helper Need to convert shadow_notrap_nonpresent -> shadow_trap_nonpresent when unsyncing pages. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c9b4b902527..57c7580e7f9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1173,6 +1173,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) __set_bit(slot, &sp->slot_bitmap); } +static void mmu_convert_notrap(struct kvm_mmu_page *sp) +{ + int i; + u64 *pt = sp->spt; + + if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] == shadow_notrap_nonpresent_pte) + set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); + } +} + struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; -- cgit v1.2.3-70-g09d2 From 4731d4c7a07769cf2926c327177b97bb8c68cafc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:39 -0300 Subject: KVM: MMU: out of sync shadow core Allow guest pagetables to go out of sync. Instead of emulating write accesses to guest pagetables, or unshadowing them, we un-write-protect the page table and allow the guest to modify it at will. We rely on invlpg executions to synchronize individual ptes, and will synchronize the entire pagetable on tlb flushes. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 210 +++++++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/x86.c | 3 + include/asm-x86/kvm_host.h | 3 + include/linux/kvm_host.h | 1 + 5 files changed, 201 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 57c7580e7f9..d88659ae777 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,10 @@ struct kvm_shadow_walk { u64 addr, u64 *spte, int level); }; +struct kvm_unsync_walk { + int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); +}; + typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); static struct kmem_cache *pte_chain_cache; @@ -654,8 +658,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) if (write_protected) kvm_flush_remote_tlbs(kvm); - - account_shadowed(kvm, gfn); } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) @@ -908,6 +910,41 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_unsync_walk *walker) +{ + int i, ret; + + if (!sp->unsync_children) + return 0; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + ret = mmu_unsync_walk(child, walker); + if (ret) + return ret; + } + + if (child->unsync) { + ret = walker->entry(child, walker); + if (ret) + return ret; + } + } + } + + if (i == PT64_ENT_PER_PAGE) + sp->unsync_children = 0; + + return 0; +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -928,6 +965,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } +static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + WARN_ON(!sp->unsync); + sp->unsync = 0; + --kvm->stat.mmu_unsync; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + if (sp->role.glevels != vcpu->arch.mmu.root_level) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + rmap_write_protect(vcpu->kvm, sp->gfn); + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + kvm_mmu_flush_tlb(vcpu); + kvm_unlink_unsync_page(vcpu->kvm, sp); + return 0; +} + +struct sync_walker { + struct kvm_vcpu *vcpu; + struct kvm_unsync_walk walker; +}; + +static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +{ + struct sync_walker *sync_walk = container_of(walk, struct sync_walker, + walker); + struct kvm_vcpu *vcpu = sync_walk->vcpu; + + kvm_sync_page(vcpu, sp); + return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); +} + +static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + struct sync_walker walker = { + .walker = { .entry = mmu_sync_fn, }, + .vcpu = vcpu, + }; + + while (mmu_unsync_walk(sp, &walker.walker)) + cond_resched_lock(&vcpu->kvm->mmu_lock); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -941,7 +1031,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned quadrant; struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node; + struct hlist_node *node, *tmp; role.word = 0; role.glevels = vcpu->arch.mmu.root_level; @@ -957,8 +1047,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn, role.word); index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && sp->role.word == role.word) { + hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) + if (sp->gfn == gfn) { + if (sp->unsync) + if (kvm_sync_page(vcpu, sp)) + continue; + + if (sp->role.word != role.word) + continue; + + if (sp->unsync_children) + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + mmu_page_add_parent_pte(vcpu, sp, parent_pte); pgprintk("%s: found\n", __func__); return sp; @@ -971,8 +1071,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); - if (!metaphysical) + if (!metaphysical) { rmap_write_protect(vcpu->kvm, gfn); + account_shadowed(vcpu->kvm, gfn); + } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) vcpu->arch.mmu.prefetch_page(vcpu, sp); else @@ -1078,14 +1180,47 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } } +struct zap_walker { + struct kvm_unsync_walk walker; + struct kvm *kvm; + int zapped; +}; + +static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +{ + struct zap_walker *zap_walk = container_of(walk, struct zap_walker, + walker); + kvm_mmu_zap_page(zap_walk->kvm, sp); + zap_walk->zapped = 1; + return 0; +} + +static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct zap_walker walker = { + .walker = { .entry = mmu_zap_fn, }, + .kvm = kvm, + .zapped = 0, + }; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) + return 0; + mmu_unsync_walk(sp, &walker.walker); + return walker.zapped; +} + static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { + int ret; ++kvm->stat.mmu_shadow_zapped; + ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.metaphysical) unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) + kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); @@ -1095,7 +1230,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_reload_remote_mmus(kvm); } kvm_mmu_reset_last_pte_updated(kvm); - return 0; + return ret; } /* @@ -1201,10 +1336,58 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) return page; } +static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + sp->unsync_children = 1; + return 1; +} + +static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + + index = kvm_page_table_hashfn(sp->gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != sp->gfn || s->role.metaphysical) + continue; + if (s->role.word != sp->role.word) + return 1; + } + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + mmu_convert_notrap(sp); + return 0; +} + +static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) +{ + struct kvm_mmu_page *shadow; + + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + return 1; + if (shadow->unsync) + return 0; + if (can_unsync) + return kvm_unsync_page(vcpu, shadow); + return 1; + } + return 0; +} + static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, - gfn_t gfn, pfn_t pfn, bool speculative) + gfn_t gfn, pfn_t pfn, bool speculative, + bool can_unsync) { u64 spte; int ret = 0; @@ -1231,7 +1414,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { ret = 1; @@ -1241,8 +1423,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= PT_WRITABLE_MASK; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); ret = 1; @@ -1260,7 +1441,6 @@ set_pte: return ret; } - static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, @@ -1298,7 +1478,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative)) { + dirty, largepage, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1518,10 +1698,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } -static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) -{ -} - static void mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index dc169e8148b..613ec9aa674 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -580,7 +580,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, is_dirty_pte(gpte), 0, gfn, - spte_to_pfn(sp->spt[i]), true); + spte_to_pfn(sp->spt[i]), true, false); } return !nr_present; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index efee85ba07e..1c5864ac083 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -101,6 +101,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_flooded", VM_STAT(mmu_flooded) }, { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } @@ -3120,6 +3121,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 8b935cc4c14..7d36fcc0281 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -195,6 +195,8 @@ struct kvm_mmu_page { */ int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ + bool unsync; + bool unsync_children; union { u64 *parent_pte; /* !multimapped */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ @@ -371,6 +373,7 @@ struct kvm_vm_stat { u32 mmu_flooded; u32 mmu_recycled; u32 mmu_cache_miss; + u32 mmu_unsync; u32 remote_tlb_flush; u32 lpages; }; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6252802c3cc..73b7c52b949 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -35,6 +35,7 @@ #define KVM_REQ_TRIPLE_FAULT 4 #define KVM_REQ_PENDING_TIMER 5 #define KVM_REQ_UNHALT 6 +#define KVM_REQ_MMU_SYNC 7 struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; -- cgit v1.2.3-70-g09d2 From 0074ff63ebc195701062ca46e0d82fcea0fa3a0a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:40 -0300 Subject: KVM: MMU: speed up mmu_unsync_walk Cache the unsynced children information in a per-page bitmap. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 72 ++++++++++++++++++++++++++++++++++++++-------- include/asm-x86/kvm_host.h | 1 + 2 files changed, 61 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d88659ae777..cb391d629af 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -891,6 +891,52 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, } } +static void kvm_mmu_update_unsync_bitmap(u64 *spte) +{ + unsigned int index; + struct kvm_mmu_page *sp = page_header(__pa(spte)); + + index = spte - sp->spt; + __set_bit(index, sp->unsync_child_bitmap); + sp->unsync_children = 1; +} + +static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->parent_pte) + return; + + if (!sp->multimapped) { + kvm_mmu_update_unsync_bitmap(sp->parent_pte); + return; + } + + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); + } +} + +static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + sp->unsync_children = 1; + kvm_mmu_update_parents_unsync(sp); + return 1; +} + +static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_update_parents_unsync(sp); +} + static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -910,6 +956,11 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +#define for_each_unsync_children(bitmap, idx) \ + for (idx = find_first_bit(bitmap, 512); \ + idx < 512; \ + idx = find_next_bit(bitmap, 512, idx+1)) + static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walker) { @@ -918,7 +969,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, if (!sp->unsync_children) return 0; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + for_each_unsync_children(sp->unsync_child_bitmap, i) { u64 ent = sp->spt[i]; if (is_shadow_present_pte(ent)) { @@ -929,17 +980,19 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, ret = mmu_unsync_walk(child, walker); if (ret) return ret; + __clear_bit(i, sp->unsync_child_bitmap); } if (child->unsync) { ret = walker->entry(child, walker); + __clear_bit(i, sp->unsync_child_bitmap); if (ret) return ret; } } } - if (i == PT64_ENT_PER_PAGE) + if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) sp->unsync_children = 0; return 0; @@ -1056,10 +1109,11 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; - if (sp->unsync_children) - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); - mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } pgprintk("%s: found\n", __func__); return sp; } @@ -1336,12 +1390,6 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) return page; } -static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) -{ - sp->unsync_children = 1; - return 1; -} - static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { unsigned index; @@ -1358,7 +1406,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (s->role.word != sp->role.word) return 1; } - mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_mark_parents_unsync(vcpu, sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; mmu_convert_notrap(sp); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 7d36fcc0281..0992d721c5f 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -201,6 +201,7 @@ struct kvm_mmu_page { u64 *parent_pte; /* !multimapped */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ }; + DECLARE_BITMAP(unsync_child_bitmap, 512); }; struct kvm_pv_mmu_op_buffer { -- cgit v1.2.3-70-g09d2 From 582801a95d2f2ceab841779e1dec0e11dfec44c0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:41 -0300 Subject: KVM: MMU: add "oos_shadow" parameter to disable oos Subject says it all. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb391d629af..99c239c5c0a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -70,6 +70,9 @@ static int dbg = 0; module_param(dbg, bool, 0644); #endif +static int oos_shadow = 1; +module_param(oos_shadow, bool, 0644); + #ifndef MMU_DEBUG #define ASSERT(x) do { } while (0) #else @@ -1424,7 +1427,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 1; if (shadow->unsync) return 0; - if (can_unsync) + if (can_unsync && oos_shadow) return kvm_unsync_page(vcpu, shadow); return 1; } -- cgit v1.2.3-70-g09d2 From e48258009d941891fca35348986b8d280caf31cd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 24 Sep 2008 20:28:34 -0300 Subject: KVM: PIC: enhance IPI avoidance The PIC code makes little effort to avoid kvm_vcpu_kick(), resulting in unnecessary guest exits in some conditions. For example, if the timer interrupt is routed through the IOAPIC, IRR for IRQ 0 will get set but not cleared, since the APIC is handling the acks. This means that everytime an interrupt < 16 is triggered, the priority logic will find IRQ0 pending and send an IPI to vcpu0 (in case IRQ0 is not masked, which is Linux's case). Introduce a new variable isr_ack to represent the IRQ's for which the guest has been signalled / cleared the ISR. Use it to avoid more than one IPI per trigger-ack cycle, in addition to the avoidance when ISR is set in get_priority(). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 17 +++++++++++++++-- arch/x86/kvm/irq.h | 2 ++ arch/x86/kvm/x86.c | 1 + 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 71e3eeeccae..17e41e165f1 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -33,6 +33,14 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); + s->isr_ack |= (1 << irq); +} + +void kvm_pic_clear_isr_ack(struct kvm *kvm) +{ + struct kvm_pic *s = pic_irqchip(kvm); + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; } /* @@ -213,6 +221,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s->irr = 0; s->imr = 0; s->isr = 0; + s->isr_ack = 0xff; s->priority_add = 0; s->irq_base = 0; s->read_reg_select = 0; @@ -444,10 +453,14 @@ static void pic_irq_request(void *opaque, int level) { struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm->vcpus[0]; + struct kvm_pic *s = pic_irqchip(kvm); + int irq = pic_get_irq(&s->pics[0]); - pic_irqchip(kvm)->output = level; - if (vcpu) + s->output = level; + if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { + s->pics[0].isr_ack &= ~(1 << irq); kvm_vcpu_kick(vcpu); + } } struct kvm_pic *kvm_create_pic(struct kvm *kvm) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 479a3d2d561..4748532fd1d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -42,6 +42,7 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ + u8 isr_ack; /* interrupt ack detection */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; @@ -70,6 +71,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm); void kvm_pic_set_irq(void *opaque, int irq, int level); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); +void kvm_pic_clear_isr_ack(struct kvm *kvm); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1c5864ac083..4cfdd1b6df0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3963,6 +3963,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, pr_debug("Set back pending irq %d\n", pending_vec); } + kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); -- cgit v1.2.3-70-g09d2 From e5fcfc821a467bd0827635db8fd39ab1213987e5 Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Thu, 25 Sep 2008 23:32:02 +0800 Subject: KVM: Device Assignment: Map mmio pages into VT-d page table Assigned device could DMA to mmio pages, so also need to map mmio pages into VT-d page table. Signed-off-by: Weidong Han Signed-off-by: Avi Kivity --- arch/x86/kvm/vtd.c | 29 +++++++++++------------------ include/asm-x86/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 2 +- 3 files changed, 12 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c index 667bf3fb64b..a770874f3a3 100644 --- a/arch/x86/kvm/vtd.c +++ b/arch/x86/kvm/vtd.c @@ -36,37 +36,30 @@ int kvm_iommu_map_pages(struct kvm *kvm, { gfn_t gfn = base_gfn; pfn_t pfn; - int i, r; + int i, r = 0; struct dmar_domain *domain = kvm->arch.intel_iommu_domain; /* check if iommu exists and in use */ if (!domain) return 0; - r = -EINVAL; for (i = 0; i < npages; i++) { /* check if already mapped */ pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, gfn_to_gpa(gfn)); - if (pfn && !is_mmio_pfn(pfn)) + if (pfn) continue; pfn = gfn_to_pfn(kvm, gfn); - if (!is_mmio_pfn(pfn)) { - r = intel_iommu_page_mapping(domain, - gfn_to_gpa(gfn), - pfn_to_hpa(pfn), - PAGE_SIZE, - DMA_PTE_READ | - DMA_PTE_WRITE); - if (r) { - printk(KERN_DEBUG "kvm_iommu_map_pages:" - "iommu failed to map pfn=%lx\n", pfn); - goto unmap_pages; - } - } else { - printk(KERN_DEBUG "kvm_iommu_map_page:" - "invalid pfn=%lx\n", pfn); + r = intel_iommu_page_mapping(domain, + gfn_to_gpa(gfn), + pfn_to_hpa(pfn), + PAGE_SIZE, + DMA_PTE_READ | + DMA_PTE_WRITE); + if (r) { + printk(KERN_ERR "kvm_iommu_map_pages:" + "iommu failed to map pfn=%lx\n", pfn); goto unmap_pages; } gfn++; diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 0992d721c5f..ca6bbc0bd97 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -502,8 +502,6 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, gpa_t addr, unsigned long *ret); -int is_mmio_pfn(pfn_t pfn); - extern bool tdp_enabled; enum emulation_result { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6cf042789ad..98cd916448a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -76,7 +76,7 @@ static inline int valid_vcpu(int n) return likely(n >= 0 && n < KVM_MAX_VCPUS); } -inline int is_mmio_pfn(pfn_t pfn) +static inline int is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) return PageReserved(pfn_to_page(pfn)); -- cgit v1.2.3-70-g09d2 From 1b10bf31a5de5b76e2e9c2937878a45c5ae2be37 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 30 Sep 2008 10:41:06 +0200 Subject: KVM: x86: Silence various LAPIC-related host kernel messages KVM-x86 dumps a lot of debug messages that have no meaning for normal operation: - INIT de-assertion is ignored - SIPIs are sent and received - APIC writes are unaligned or < 4 byte long (Windows Server 2003 triggers this on SMP) Degrade them to true debug messages, keeping the host kernel log clean for real problems. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 16 +++++++--------- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fd00f698692..6571926bfd3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -365,16 +365,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { - printk(KERN_DEBUG - "Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); + apic_debug("Ignoring de-assert INIT to vcpu %d\n", + vcpu->vcpu_id); } - break; case APIC_DM_STARTUP: - printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); + apic_debug("SIPI to vcpu %d vector 0x%02x\n", + vcpu->vcpu_id, vector); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; @@ -679,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this, * Refer SDM 8.4.1 */ if (len != 4 || alignment) { - if (printk_ratelimit()) - printk(KERN_ERR "apic write: bad size=%d %lx\n", - len, (long)address); + /* Don't shout loud, $infamous_os would cause only noise. */ + apic_debug("apic write: bad size=%d %lx\n", + len, (long)address); return; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cfdd1b6df0..d6d7123d264 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3224,8 +3224,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - printk("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); r = kvm_x86_ops->vcpu_reset(vcpu); if (r) -- cgit v1.2.3-70-g09d2 From 83dbc83a0d7c88c919d769177bd1924a46c9c034 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 7 Oct 2008 17:01:27 -0300 Subject: KVM: VMX: enable invlpg exiting if EPT is disabled Manually disabling EPT via module option fails to re-enable INVLPG exiting. Reported-by: Gleb Natapov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4556cc3715b..2643b430d83 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2118,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) } if (!vm_need_ept()) exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING; + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); if (cpu_has_secondary_exec_ctrls()) { -- cgit v1.2.3-70-g09d2 From 371c01b28e4049d1fbf60a9631cdad98f7cae030 Mon Sep 17 00:00:00 2001 From: Zhang xiantao Date: Thu, 11 Sep 2008 13:19:32 +0800 Subject: KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/ Preparation for kvm/ia64 VT-d support. Signed-off-by: Zhang xiantao Signed-off-by: Avi Kivity --- arch/x86/kvm/Makefile | 6 +- arch/x86/kvm/vtd.c | 191 -------------------------------------------------- virt/kvm/vtd.c | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 194 deletions(-) delete mode 100644 arch/x86/kvm/vtd.c create mode 100644 virt/kvm/vtd.c (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 3072b17447a..7dce593b9c6 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,14 +7,14 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif +ifeq ($(CONFIG_DMAR),y) +common-objs += $(addprefix ../../../virt/kvm/, vtd.o) +endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ i8254.o -ifeq ($(CONFIG_DMAR),y) -kvm-objs += vtd.o -endif obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c deleted file mode 100644 index a770874f3a3..00000000000 --- a/arch/x86/kvm/vtd.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Author: Allen M. Kay - * Author: Weidong Han - * Author: Ben-Ami Yassour - */ - -#include -#include -#include -#include -#include - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -int kvm_iommu_map_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - gfn_t gfn = base_gfn; - pfn_t pfn; - int i, r = 0; - struct dmar_domain *domain = kvm->arch.intel_iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - for (i = 0; i < npages; i++) { - /* check if already mapped */ - pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, - gfn_to_gpa(gfn)); - if (pfn) - continue; - - pfn = gfn_to_pfn(kvm, gfn); - r = intel_iommu_page_mapping(domain, - gfn_to_gpa(gfn), - pfn_to_hpa(pfn), - PAGE_SIZE, - DMA_PTE_READ | - DMA_PTE_WRITE); - if (r) { - printk(KERN_ERR "kvm_iommu_map_pages:" - "iommu failed to map pfn=%lx\n", pfn); - goto unmap_pages; - } - gfn++; - } - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, base_gfn, i); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int i, r; - - down_read(&kvm->slots_lock); - for (i = 0; i < kvm->nmemslots; i++) { - r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, - kvm->memslots[i].npages); - if (r) - break; - } - up_read(&kvm->slots_lock); - return r; -} - -int kvm_iommu_map_guest(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - struct pci_dev *pdev = NULL; - int r; - - if (!intel_iommu_found()) { - printk(KERN_ERR "%s: intel iommu not found\n", __func__); - return -ENODEV; - } - - printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n", - assigned_dev->host_busnr, - PCI_SLOT(assigned_dev->host_devfn), - PCI_FUNC(assigned_dev->host_devfn)); - - pdev = assigned_dev->dev; - - if (pdev == NULL) { - if (kvm->arch.intel_iommu_domain) { - intel_iommu_domain_exit(kvm->arch.intel_iommu_domain); - kvm->arch.intel_iommu_domain = NULL; - } - return -ENODEV; - } - - kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev); - if (!kvm->arch.intel_iommu_domain) - return -ENODEV; - - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - - intel_iommu_detach_dev(kvm->arch.intel_iommu_domain, - pdev->bus->number, pdev->devfn); - - r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain, - pdev); - if (r) { - printk(KERN_ERR "Domain context map for %s failed", - pci_name(pdev)); - goto out_unmap; - } - return 0; - -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - gfn_t gfn = base_gfn; - pfn_t pfn; - struct dmar_domain *domain = kvm->arch.intel_iommu_domain; - int i; - - for (i = 0; i < npages; i++) { - pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, - gfn_to_gpa(gfn)); - kvm_release_pfn_clean(pfn); - gfn++; - } -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int i; - down_read(&kvm->slots_lock); - for (i = 0; i < kvm->nmemslots; i++) { - kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, - kvm->memslots[i].npages); - } - up_read(&kvm->slots_lock); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *entry; - struct dmar_domain *domain = kvm->arch.intel_iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) { - printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n", - entry->host_busnr, - PCI_SLOT(entry->host_devfn), - PCI_FUNC(entry->host_devfn)); - - /* detach kvm dmar domain */ - intel_iommu_detach_dev(domain, entry->host_busnr, - entry->host_devfn); - } - kvm_iommu_unmap_memslots(kvm); - intel_iommu_domain_exit(domain); - return 0; -} diff --git a/virt/kvm/vtd.c b/virt/kvm/vtd.c new file mode 100644 index 00000000000..a770874f3a3 --- /dev/null +++ b/virt/kvm/vtd.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Copyright IBM Corporation, 2008 + * Author: Allen M. Kay + * Author: Weidong Han + * Author: Ben-Ami Yassour + */ + +#include +#include +#include +#include +#include + +static int kvm_iommu_unmap_memslots(struct kvm *kvm); +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages); + +int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + int i, r = 0; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + for (i = 0; i < npages; i++) { + /* check if already mapped */ + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + if (pfn) + continue; + + pfn = gfn_to_pfn(kvm, gfn); + r = intel_iommu_page_mapping(domain, + gfn_to_gpa(gfn), + pfn_to_hpa(pfn), + PAGE_SIZE, + DMA_PTE_READ | + DMA_PTE_WRITE); + if (r) { + printk(KERN_ERR "kvm_iommu_map_pages:" + "iommu failed to map pfn=%lx\n", pfn); + goto unmap_pages; + } + gfn++; + } + return 0; + +unmap_pages: + kvm_iommu_put_pages(kvm, base_gfn, i); + return r; +} + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int i, r; + + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + if (r) + break; + } + up_read(&kvm->slots_lock); + return r; +} + +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + struct pci_dev *pdev = NULL; + int r; + + if (!intel_iommu_found()) { + printk(KERN_ERR "%s: intel iommu not found\n", __func__); + return -ENODEV; + } + + printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n", + assigned_dev->host_busnr, + PCI_SLOT(assigned_dev->host_devfn), + PCI_FUNC(assigned_dev->host_devfn)); + + pdev = assigned_dev->dev; + + if (pdev == NULL) { + if (kvm->arch.intel_iommu_domain) { + intel_iommu_domain_exit(kvm->arch.intel_iommu_domain); + kvm->arch.intel_iommu_domain = NULL; + } + return -ENODEV; + } + + kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev); + if (!kvm->arch.intel_iommu_domain) + return -ENODEV; + + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + + intel_iommu_detach_dev(kvm->arch.intel_iommu_domain, + pdev->bus->number, pdev->devfn); + + r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain, + pdev); + if (r) { + printk(KERN_ERR "Domain context map for %s failed", + pci_name(pdev)); + goto out_unmap; + } + return 0; + +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + int i; + + for (i = 0; i < npages; i++) { + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + kvm_release_pfn_clean(pfn); + gfn++; + } +} + +static int kvm_iommu_unmap_memslots(struct kvm *kvm) +{ + int i; + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + } + up_read(&kvm->slots_lock); + + return 0; +} + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct kvm_assigned_dev_kernel *entry; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) { + printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n", + entry->host_busnr, + PCI_SLOT(entry->host_devfn), + PCI_FUNC(entry->host_devfn)); + + /* detach kvm dmar domain */ + intel_iommu_detach_dev(domain, entry->host_busnr, + entry->host_devfn); + } + kvm_iommu_unmap_memslots(kvm); + intel_iommu_domain_exit(domain); + return 0; +} -- cgit v1.2.3-70-g09d2 From 8a98f6648a2b0756d8f26d6c13332f5526355fec Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Mon, 6 Oct 2008 13:47:38 +0800 Subject: KVM: Move device assignment logic to common code To share with other archs, this patch moves device assignment logic to common parts. Signed-off-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 255 -------------------------------------------- include/linux/kvm.h | 2 + include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 268 ++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 269 insertions(+), 257 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d6d7123d264..f8bde01ba8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -107,238 +106,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct list_head *ptr; - struct kvm_assigned_dev_kernel *match; - - list_for_each(ptr, head) { - match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) -{ - struct kvm_assigned_dev_kernel *assigned_dev; - - assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, - interrupt_work); - - /* This is taken to safely inject irq inside the guest. When - * the interrupt injection (or the ioapic code) uses a - * finer-grained lock, update this - */ - mutex_lock(&assigned_dev->kvm->lock); - kvm_set_irq(assigned_dev->kvm, - assigned_dev->guest_irq, 1); - mutex_unlock(&assigned_dev->kvm->lock); - kvm_put_kvm(assigned_dev->kvm); -} - -/* FIXME: Implement the OR logic needed to make shared interrupts on - * this line behave properly - */ -static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = - (struct kvm_assigned_dev_kernel *) dev_id; - - kvm_get_kvm(assigned_dev->kvm); - schedule_work(&assigned_dev->interrupt_work); - disable_irq_nosync(irq); - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev; - - if (kian->gsi == -1) - return; - - dev = container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - kvm_set_irq(dev->kvm, dev->guest_irq, 0); - enable_irq(dev->host_irq); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) - free_irq(assigned_dev->host_irq, (void *)assigned_dev); - - kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); - - if (cancel_work_sync(&assigned_dev->interrupt_work)) - /* We had pending work. That means we will have to take - * care of kvm_put_kvm. - */ - kvm_put_kvm(kvm); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -static void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct list_head *ptr, *ptr2; - struct kvm_assigned_dev_kernel *assigned_dev; - - list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { - assigned_dev = list_entry(ptr, - struct kvm_assigned_dev_kernel, - list); - - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) { - mutex_unlock(&kvm->lock); - return -EINVAL; - } - - if (match->irq_requested) { - match->guest_irq = assigned_irq->guest_irq; - match->ack_notifier.gsi = assigned_irq->guest_irq; - mutex_unlock(&kvm->lock); - return 0; - } - - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); - - if (irqchip_in_kernel(kvm)) { - if (!capable(CAP_SYS_RAWIO)) { - r = -EPERM; - goto out_release; - } - - if (assigned_irq->host_irq) - match->host_irq = assigned_irq->host_irq; - else - match->host_irq = match->dev->irq; - match->guest_irq = assigned_irq->guest_irq; - match->ack_notifier.gsi = assigned_irq->guest_irq; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - kvm_register_irq_ack_notifier(kvm, &match->ack_notifier); - - /* Even though this is PCI, we don't want to use shared - * interrupts. Sharing host devices with guest-assigned devices - * on the same interrupt line is not a happy situation: there - * are going to be long delays in accepting, acking, etc. - */ - if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_device", (void *)match)) { - r = -EIO; - goto out_release; - } - } - - match->irq_requested = true; - mutex_unlock(&kvm->lock); - return r; -out_release: - mutex_unlock(&kvm->lock); - kvm_free_assigned_device(kvm, match); - return r; -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EINVAL; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_bus_and_slot(assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->dev = dev; - - match->kvm = kvm; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { - r = kvm_iommu_map_guest(kvm, match); - if (r) - goto out_list_del; - } - -out: - mutex_unlock(&kvm->lock); - return r; -out_list_del: - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - mutex_unlock(&kvm->lock); - return r; -} - unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -2030,28 +1797,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } case KVM_GET_PIT: { r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 4269be171fa..9acf34a6dfb 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -383,7 +383,9 @@ struct kvm_trace_rec { #define KVM_CAP_MP_STATE 14 #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ +#ifdef CONFIG_X86 #define KVM_CAP_DEVICE_ASSIGNMENT 17 +#endif #define KVM_CAP_IOMMU 18 /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 73b7c52b949..10c1146cd00 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -281,6 +281,7 @@ void kvm_free_physmem(struct kvm *kvm); struct kvm *kvm_arch_create_vm(void); void kvm_arch_destroy_vm(struct kvm *kvm); +void kvm_free_all_assigned_devices(struct kvm *kvm); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *v); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 98cd916448a..485bcdc1655 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -51,6 +51,12 @@ #include "coalesced_mmio.h" #endif +#ifdef KVM_CAP_DEVICE_ASSIGNMENT +#include +#include +#include "irq.h" +#endif + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -71,6 +77,240 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, bool kvm_rebooting; +#ifdef KVM_CAP_DEVICE_ASSIGNMENT +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +{ + struct kvm_assigned_dev_kernel *assigned_dev; + + assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, + interrupt_work); + + /* This is taken to safely inject irq inside the guest. When + * the interrupt injection (or the ioapic code) uses a + * finer-grained lock, update this + */ + mutex_lock(&assigned_dev->kvm->lock); + kvm_set_irq(assigned_dev->kvm, + assigned_dev->guest_irq, 1); + mutex_unlock(&assigned_dev->kvm->lock); + kvm_put_kvm(assigned_dev->kvm); +} + +/* FIXME: Implement the OR logic needed to make shared interrupts on + * this line behave properly + */ +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = + (struct kvm_assigned_dev_kernel *) dev_id; + + kvm_get_kvm(assigned_dev->kvm); + schedule_work(&assigned_dev->interrupt_work); + disable_irq_nosync(irq); + return IRQ_HANDLED; +} + +/* Ack the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev; + + if (kian->gsi == -1) + return; + + dev = container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + kvm_set_irq(dev->kvm, dev->guest_irq, 0); + enable_irq(dev->host_irq); +} + +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) + free_irq(assigned_dev->host_irq, (void *)assigned_dev); + + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); + + if (cancel_work_sync(&assigned_dev->interrupt_work)) + /* We had pending work. That means we will have to take + * care of kvm_put_kvm. + */ + kvm_put_kvm(kvm); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + + if (match->irq_requested) { + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + mutex_unlock(&kvm->lock); + return 0; + } + + INIT_WORK(&match->interrupt_work, + kvm_assigned_dev_interrupt_work_handler); + + if (irqchip_in_kernel(kvm)) { + if (!capable(CAP_SYS_RAWIO)) { + r = -EPERM; + goto out_release; + } + + if (assigned_irq->host_irq) + match->host_irq = assigned_irq->host_irq; + else + match->host_irq = match->dev->irq; + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + kvm_register_irq_ack_notifier(kvm, &match->ack_notifier); + + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. + */ + if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, + "kvm_assigned_device", (void *)match)) { + r = -EIO; + goto out_release; + } + } + + match->irq_requested = true; + mutex_unlock(&kvm->lock); + return r; +out_release: + mutex_unlock(&kvm->lock); + kvm_free_assigned_device(kvm, match); + return r; +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EINVAL; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_bus_and_slot(assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->dev = dev; + + match->kvm = kvm; + + list_add(&match->list, &kvm->arch.assigned_dev_head); + + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { + r = kvm_iommu_map_guest(kvm, match); + if (r) + goto out_list_del; + } + +out: + mutex_unlock(&kvm->lock); + return r; +out_list_del: + list_del(&match->list); + pci_release_regions(dev); +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + mutex_unlock(&kvm->lock); + return r; +} +#endif + static inline int valid_vcpu(int n) { return likely(n >= 0 && n < KVM_MAX_VCPUS); @@ -578,12 +818,12 @@ int __kvm_set_memory_region(struct kvm *kvm, } kvm_free_physmem_slot(&old, &new); - +#ifdef CONFIG_DMAR /* map the pages in iommu page table */ r = kvm_iommu_map_pages(kvm, base_gfn, npages); if (r) goto out; - +#endif return 0; out_free: @@ -1382,6 +1622,30 @@ static long kvm_vm_ioctl(struct file *filp, r = 0; break; } +#endif +#ifdef KVM_CAP_DEVICE_ASSIGNMENT + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } #endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); -- cgit v1.2.3-70-g09d2 From 3de42dc094ecd313dc7d551e007a134b52f8663d Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Mon, 6 Oct 2008 13:48:45 +0800 Subject: KVM: Separate irq ack notification out of arch/x86/kvm/irq.c Moving irq ack notification logic as common, and make it shared with ia64 side. Signed-off-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 4 +++ arch/ia64/kvm/Makefile | 2 +- arch/ia64/kvm/irq.h | 5 ---- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/irq.c | 33 ---------------------- arch/x86/kvm/irq.h | 8 ------ include/asm-x86/kvm_host.h | 2 ++ include/linux/kvm_host.h | 6 ++++ virt/kvm/irq_comm.c | 60 ++++++++++++++++++++++++++++++++++++++++ 9 files changed, 74 insertions(+), 48 deletions(-) create mode 100644 virt/kvm/irq_comm.c (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 1efe513a994..da579a33db1 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -413,6 +413,10 @@ struct kvm_arch { struct kvm_ioapic *vioapic; struct kvm_vm_stat stat; struct kvm_sal_data rdv_sal_data; + + struct list_head assigned_dev_head; + struct dmar_domain *intel_iommu_domain; + struct hlist_head irq_ack_notifier_list; }; union cpuid3_t { diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile index bf22fb9e6dc..3b1a1c156ef 100644 --- a/arch/ia64/kvm/Makefile +++ b/arch/ia64/kvm/Makefile @@ -44,7 +44,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/ EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o) + coalesced_mmio.o irq_comm.o) kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h index f2e6545debf..604329ac3c9 100644 --- a/arch/ia64/kvm/irq.h +++ b/arch/ia64/kvm/irq.h @@ -23,10 +23,5 @@ #ifndef __IRQ_H #define __IRQ_H -struct kvm; - -static inline void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) -{ -} #endif diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 7dce593b9c6..c02343594b4 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -3,7 +3,7 @@ # common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o) + coalesced_mmio.o irq_comm.o) ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 8c1b9c5def7..c019b8edcdb 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -99,36 +99,3 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); } - -/* This should be called with the kvm->lock mutex held */ -void kvm_set_irq(struct kvm *kvm, int irq, int level) -{ - /* Not possible to detect if the guest uses the PIC or the - * IOAPIC. So set the bit in both. The guest will ignore - * writes to the unused one. - */ - kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level); - kvm_pic_set_irq(pic_irqchip(kvm), irq, level); -} - -void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) -{ - struct kvm_irq_ack_notifier *kian; - struct hlist_node *n; - - hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) - if (kian->gsi == gsi) - kian->irq_acked(kian); -} - -void kvm_register_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian) -{ - hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); -} - -void kvm_unregister_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian) -{ - hlist_del(&kian->link); -} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 4748532fd1d..f17c8f5bbf3 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -68,7 +68,6 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_pic_set_irq(void *opaque, int irq, int level); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_clear_isr_ack(struct kvm *kvm); @@ -85,13 +84,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s); -void kvm_set_irq(struct kvm *kvm, int irq, int level); -void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); -void kvm_register_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian); -void kvm_unregister_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian); - void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index ca6bbc0bd97..411fb8cfb24 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -565,6 +565,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); +void kvm_pic_set_irq(void *opaque, int irq, int level); + void kvm_inject_nmi(struct kvm_vcpu *vcpu); void fx_init(struct kvm_vcpu *vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b3b7598b4d9..3833c48fae3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -309,6 +309,12 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; }; +void kvm_set_irq(struct kvm *kvm, int irq, int level); +void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); #ifdef CONFIG_DMAR int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn, diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c new file mode 100644 index 00000000000..d0169f5e604 --- /dev/null +++ b/virt/kvm/irq_comm.c @@ -0,0 +1,60 @@ +/* + * irq_comm.c: Common API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong + * + */ + +#include +#include "irq.h" + +#include "ioapic.h" + +/* This should be called with the kvm->lock mutex held */ +void kvm_set_irq(struct kvm *kvm, int irq, int level) +{ + /* Not possible to detect if the guest uses the PIC or the + * IOAPIC. So set the bit in both. The guest will ignore + * writes to the unused one. + */ + kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level); +#ifdef CONFIG_X86 + kvm_pic_set_irq(pic_irqchip(kvm), irq, level); +#endif +} + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) +{ + struct kvm_irq_ack_notifier *kian; + struct hlist_node *n; + + hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) + if (kian->gsi == gsi) + kian->irq_acked(kian); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_del(&kian->link); +} -- cgit v1.2.3-70-g09d2 From e2fee2761ad1df2d29b9d502a3cefc87a17b32ca Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 18 Jul 2008 17:36:20 +0200 Subject: OProfile: Rework oprofile_add_ibs_sample() function Code looks much more cleaner now. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 5 +++-- drivers/oprofile/cpu_buffer.c | 48 +++++++++++++--------------------------- 2 files changed, 18 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index d9faf607b3a..54632e0d6bf 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -67,8 +67,9 @@ static unsigned long reset_value[NUM_COUNTERS]; /* The function interface needs to be fixed, something like add data. Should then be added to linux/oprofile.h. */ -extern void oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int * const ibs_sample, u8 code); +extern void +oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, int ibs_code); struct ibs_fetch_sample { /* MSRC001_1031 IBS Fetch Linear Address Register */ diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index e1bd5a937f6..d6c68270b3d 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -257,21 +257,23 @@ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) #ifdef CONFIG_OPROFILE_IBS -#define MAX_IBS_SAMPLE_SIZE 14 -static int log_ibs_sample(struct oprofile_cpu_buffer *cpu_buf, - unsigned long pc, int is_kernel, unsigned int *ibs, int ibs_code) +#define MAX_IBS_SAMPLE_SIZE 14 + +void oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, int ibs_code) { + int is_kernel = !user_mode(regs); + struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); struct task_struct *task; cpu_buf->sample_received++; if (nr_available_slots(cpu_buf) < MAX_IBS_SAMPLE_SIZE) { + /* we can't backtrace since we lost the source of this event */ cpu_buf->sample_lost_overflow++; - return 0; + return; } - is_kernel = !!is_kernel; - /* notice a switch from user->kernel or vice versa */ if (cpu_buf->last_is_kernel != is_kernel) { cpu_buf->last_is_kernel = is_kernel; @@ -281,7 +283,6 @@ static int log_ibs_sample(struct oprofile_cpu_buffer *cpu_buf, /* notice a task switch */ if (!is_kernel) { task = current; - if (cpu_buf->last_task != task) { cpu_buf->last_task = task; add_code(cpu_buf, (unsigned long)task); @@ -289,36 +290,17 @@ static int log_ibs_sample(struct oprofile_cpu_buffer *cpu_buf, } add_code(cpu_buf, ibs_code); - add_sample(cpu_buf, ibs[0], ibs[1]); - add_sample(cpu_buf, ibs[2], ibs[3]); - add_sample(cpu_buf, ibs[4], ibs[5]); + add_sample(cpu_buf, ibs_sample[0], ibs_sample[1]); + add_sample(cpu_buf, ibs_sample[2], ibs_sample[3]); + add_sample(cpu_buf, ibs_sample[4], ibs_sample[5]); if (ibs_code == IBS_OP_BEGIN) { - add_sample(cpu_buf, ibs[6], ibs[7]); - add_sample(cpu_buf, ibs[8], ibs[9]); - add_sample(cpu_buf, ibs[10], ibs[11]); - } - - return 1; -} - -void oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int * const ibs_sample, u8 code) -{ - int is_kernel = !user_mode(regs); - unsigned long pc = profile_pc(regs); - - struct oprofile_cpu_buffer *cpu_buf = - &per_cpu(cpu_buffer, smp_processor_id()); - - if (!backtrace_depth) { - log_ibs_sample(cpu_buf, pc, is_kernel, ibs_sample, code); - return; + add_sample(cpu_buf, ibs_sample[6], ibs_sample[7]); + add_sample(cpu_buf, ibs_sample[8], ibs_sample[9]); + add_sample(cpu_buf, ibs_sample[10], ibs_sample[11]); } - /* if log_sample() fails we can't backtrace since we lost the source - * of this event */ - if (log_ibs_sample(cpu_buf, pc, is_kernel, ibs_sample, code)) + if (backtrace_depth) oprofile_ops.backtrace(regs, backtrace_depth); } -- cgit v1.2.3-70-g09d2 From 2d55a478827f3eed2ee9701605fdeb9cac2d78dc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 18 Jul 2008 17:56:05 +0200 Subject: OProfile: Rework string handling in setup_ibs_files() Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 54632e0d6bf..0657c56a66b 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -473,7 +473,6 @@ static int (*create_arch_files)(struct super_block * sb, struct dentry * root); static int setup_ibs_files(struct super_block * sb, struct dentry * root) { - char buf[12]; struct dentry *dir; int ret = 0; @@ -495,22 +494,22 @@ static int setup_ibs_files(struct super_block * sb, struct dentry * root) ibs_config.max_cnt_op = 250000; ibs_config.op_enabled = 0; ibs_config.dispatched_ops = 1; - snprintf(buf, sizeof(buf), "ibs_fetch"); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "rand_enable", - &ibs_config.rand_en); + + dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.fetch_enabled); + &ibs_config.fetch_enabled); oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_fetch); - snprintf(buf, sizeof(buf), "ibs_uops"); - dir = oprofilefs_mkdir(sb, root, buf); + &ibs_config.max_cnt_fetch); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + + dir = oprofilefs_mkdir(sb, root, "ibs_uops"); oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.op_enabled); + &ibs_config.op_enabled); oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_op); + &ibs_config.max_cnt_op); oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); + &ibs_config.dispatched_ops); return 0; } -- cgit v1.2.3-70-g09d2 From ccd755c2d90dd9b9729ba5975f7c92bf206ddcf7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 29 Jul 2008 16:57:10 +0200 Subject: OProfile: Rename IBS sysfs dir into "ibs_op" The new name is now more close to those used in the spec. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 0657c56a66b..23ce63f2762 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -503,7 +503,7 @@ static int setup_ibs_files(struct super_block * sb, struct dentry * root) oprofilefs_create_ulong(sb, dir, "rand_enable", &ibs_config.rand_en); - dir = oprofilefs_mkdir(sb, root, "ibs_uops"); + dir = oprofilefs_mkdir(sb, root, "ibs_op"); oprofilefs_create_ulong(sb, dir, "enable", &ibs_config.op_enabled); oprofilefs_create_ulong(sb, dir, "max_count", -- cgit v1.2.3-70-g09d2 From c92960fccb9f32a1d6110f6dcfe483ed96c62beb Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 5 Sep 2008 17:12:36 +0200 Subject: oprofile: whitespace fixes Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 20 ++++++++++---------- arch/x86/oprofile/op_model_p4.c | 32 ++++++++++++++++---------------- arch/x86/oprofile/op_model_ppro.c | 16 ++++++++-------- drivers/oprofile/buffer_sync.c | 1 - drivers/oprofile/oprof.c | 24 ++++++++++++------------ 5 files changed, 46 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 23ce63f2762..b9a810b3326 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -530,14 +530,14 @@ static void op_amd_exit(void) #endif struct op_x86_model_spec const op_amd_spec = { - .init = op_amd_init, - .exit = op_amd_exit, - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .fill_in_addresses = &op_amd_fill_in_addresses, - .setup_ctrs = &op_amd_setup_ctrs, - .check_ctrs = &op_amd_check_ctrs, - .start = &op_amd_start, - .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown + .init = op_amd_init, + .exit = op_amd_exit, + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown }; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 43ac5af338d..4c4a51c90bc 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -698,24 +698,24 @@ static void p4_shutdown(struct op_msrs const * const msrs) #ifdef CONFIG_SMP struct op_x86_model_spec const op_p4_ht2_spec = { - .num_counters = NUM_COUNTERS_HT2, - .num_controls = NUM_CONTROLS_HT2, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown + .num_counters = NUM_COUNTERS_HT2, + .num_controls = NUM_CONTROLS_HT2, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown }; #endif struct op_x86_model_spec const op_p4_spec = { - .num_counters = NUM_COUNTERS_NON_HT, - .num_controls = NUM_CONTROLS_NON_HT, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown + .num_counters = NUM_COUNTERS_NON_HT, + .num_controls = NUM_CONTROLS_NON_HT, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown }; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index eff431f6c57..c665bac4a14 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -181,12 +181,12 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .fill_in_addresses = &ppro_fill_in_addresses, - .setup_ctrs = &ppro_setup_ctrs, - .check_ctrs = &ppro_check_ctrs, - .start = &ppro_start, - .stop = &ppro_stop, - .shutdown = &ppro_shutdown + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown }; diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index ed982273fb8..564577307a5 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -41,7 +41,6 @@ static cpumask_t marked_cpus = CPU_MASK_NONE; static DEFINE_SPINLOCK(task_mortuary); static void process_task_mortuary(void); - /* Take ownership of the task struct and place it on the * list for processing. Only after two full buffer syncs * does the task eventually get freed, because by then diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 2c645170f06..50062cea292 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -19,7 +19,7 @@ #include "cpu_buffer.h" #include "buffer_sync.h" #include "oprofile_stats.h" - + struct oprofile_operations oprofile_ops; unsigned long oprofile_started; @@ -36,7 +36,7 @@ static int timer = 0; int oprofile_setup(void) { int err; - + mutex_lock(&start_mutex); if ((err = alloc_cpu_buffers())) @@ -44,10 +44,10 @@ int oprofile_setup(void) if ((err = alloc_event_buffer())) goto out1; - + if (oprofile_ops.setup && (err = oprofile_ops.setup())) goto out2; - + /* Note even though this starts part of the * profiling overhead, it's necessary to prevent * us missing task deaths and eventually oopsing @@ -74,7 +74,7 @@ post_sync: is_setup = 1; mutex_unlock(&start_mutex); return 0; - + out3: if (oprofile_ops.shutdown) oprofile_ops.shutdown(); @@ -92,17 +92,17 @@ out: int oprofile_start(void) { int err = -EINVAL; - + mutex_lock(&start_mutex); if (!is_setup) goto out; - err = 0; - + err = 0; + if (oprofile_started) goto out; - + oprofile_reset_stats(); if ((err = oprofile_ops.start())) @@ -114,7 +114,7 @@ out: return err; } - + /* echo 0>/dev/oprofile/enable */ void oprofile_stop(void) { @@ -204,13 +204,13 @@ static void __exit oprofile_exit(void) oprofile_arch_exit(); } - + module_init(oprofile_init); module_exit(oprofile_exit); module_param_named(timer, timer, int, 0644); MODULE_PARM_DESC(timer, "force use of timer interrupt"); - + MODULE_LICENSE("GPL"); MODULE_AUTHOR("John Levon "); MODULE_DESCRIPTION("OProfile system profiler"); -- cgit v1.2.3-70-g09d2 From 25ad2913cae9c9e3ed28075caeb2eefccd636f4f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 5 Sep 2008 17:12:36 +0200 Subject: oprofile: more whitespace fixes Signed-off-by: Robert Richter --- arch/alpha/oprofile/common.c | 2 +- arch/ia64/oprofile/init.c | 4 +- arch/ia64/oprofile/perfmon.c | 4 +- arch/m32r/oprofile/init.c | 2 +- arch/mips/oprofile/common.c | 2 +- arch/mips/oprofile/op_impl.h | 2 +- arch/mips/oprofile/op_model_rm9000.c | 2 +- arch/parisc/oprofile/init.c | 2 +- arch/powerpc/oprofile/op_model_cell.c | 2 +- arch/sparc/oprofile/init.c | 2 +- arch/sparc64/oprofile/init.c | 2 +- arch/x86/oprofile/backtrace.c | 2 +- arch/x86/oprofile/op_model_amd.c | 6 +-- arch/x86/oprofile/op_x86_model.h | 4 +- drivers/oprofile/cpu_buffer.c | 22 +++++------ drivers/oprofile/cpu_buffer.h | 6 +-- drivers/oprofile/event_buffer.c | 10 ++--- drivers/oprofile/oprof.h | 4 +- drivers/oprofile/oprofile_files.c | 16 ++++---- drivers/oprofile/oprofile_stats.c | 10 ++--- drivers/oprofile/oprofile_stats.h | 2 +- drivers/oprofile/oprofilefs.c | 72 +++++++++++++++++------------------ drivers/oprofile/timer_int.c | 2 +- 23 files changed, 91 insertions(+), 91 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/oprofile/common.c b/arch/alpha/oprofile/common.c index 7c3d5ec6ec6..bd8ac533a50 100644 --- a/arch/alpha/oprofile/common.c +++ b/arch/alpha/oprofile/common.c @@ -106,7 +106,7 @@ op_axp_stop(void) } static int -op_axp_create_files(struct super_block * sb, struct dentry * root) +op_axp_create_files(struct super_block *sb, struct dentry *root) { int i; diff --git a/arch/ia64/oprofile/init.c b/arch/ia64/oprofile/init.c index 125a602a660..31b545c3546 100644 --- a/arch/ia64/oprofile/init.c +++ b/arch/ia64/oprofile/init.c @@ -12,11 +12,11 @@ #include #include -extern int perfmon_init(struct oprofile_operations * ops); +extern int perfmon_init(struct oprofile_operations *ops); extern void perfmon_exit(void); extern void ia64_backtrace(struct pt_regs * const regs, unsigned int depth); -int __init oprofile_arch_init(struct oprofile_operations * ops) +int __init oprofile_arch_init(struct oprofile_operations *ops) { int ret = -ENODEV; diff --git a/arch/ia64/oprofile/perfmon.c b/arch/ia64/oprofile/perfmon.c index bc41dd32fec..192d3e8e1f6 100644 --- a/arch/ia64/oprofile/perfmon.c +++ b/arch/ia64/oprofile/perfmon.c @@ -56,7 +56,7 @@ static pfm_buffer_fmt_t oprofile_fmt = { }; -static char * get_cpu_type(void) +static char *get_cpu_type(void) { __u8 family = local_cpu_data->family; @@ -75,7 +75,7 @@ static char * get_cpu_type(void) static int using_perfmon; -int perfmon_init(struct oprofile_operations * ops) +int perfmon_init(struct oprofile_operations *ops) { int ret = pfm_register_buffer_fmt(&oprofile_fmt); if (ret) diff --git a/arch/m32r/oprofile/init.c b/arch/m32r/oprofile/init.c index b7773e45c43..fa56860f425 100644 --- a/arch/m32r/oprofile/init.c +++ b/arch/m32r/oprofile/init.c @@ -12,7 +12,7 @@ #include #include -int __init oprofile_arch_init(struct oprofile_operations * ops) +int __init oprofile_arch_init(struct oprofile_operations *ops) { return -ENODEV; } diff --git a/arch/mips/oprofile/common.c b/arch/mips/oprofile/common.c index dd2fbd6645c..3bf3354547f 100644 --- a/arch/mips/oprofile/common.c +++ b/arch/mips/oprofile/common.c @@ -32,7 +32,7 @@ static int op_mips_setup(void) return 0; } -static int op_mips_create_files(struct super_block * sb, struct dentry * root) +static int op_mips_create_files(struct super_block *sb, struct dentry *root) { int i; diff --git a/arch/mips/oprofile/op_impl.h b/arch/mips/oprofile/op_impl.h index 2bfc17c3010..f04b54fb37d 100644 --- a/arch/mips/oprofile/op_impl.h +++ b/arch/mips/oprofile/op_impl.h @@ -27,7 +27,7 @@ struct op_counter_config { /* Per-architecture configury and hooks. */ struct op_mips_model { void (*reg_setup) (struct op_counter_config *); - void (*cpu_setup) (void * dummy); + void (*cpu_setup) (void *dummy); int (*init)(void); void (*exit)(void); void (*cpu_start)(void *args); diff --git a/arch/mips/oprofile/op_model_rm9000.c b/arch/mips/oprofile/op_model_rm9000.c index a45d3202894..3aa81384966 100644 --- a/arch/mips/oprofile/op_model_rm9000.c +++ b/arch/mips/oprofile/op_model_rm9000.c @@ -80,7 +80,7 @@ static void rm9000_cpu_stop(void *args) write_c0_perfcontrol(0); } -static irqreturn_t rm9000_perfcount_handler(int irq, void * dev_id) +static irqreturn_t rm9000_perfcount_handler(int irq, void *dev_id) { unsigned int control = read_c0_perfcontrol(); struct pt_regs *regs = get_irq_regs(); diff --git a/arch/parisc/oprofile/init.c b/arch/parisc/oprofile/init.c index 113f5139f55..026cba2af07 100644 --- a/arch/parisc/oprofile/init.c +++ b/arch/parisc/oprofile/init.c @@ -12,7 +12,7 @@ #include #include -int __init oprofile_arch_init(struct oprofile_operations * ops) +int __init oprofile_arch_init(struct oprofile_operations *ops) { return -ENODEV; } diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index 5ff4de3eb3b..35141a8bc3d 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -404,7 +404,7 @@ set_count_mode(u32 kernel, u32 user) } } -static inline void enable_ctr(u32 cpu, u32 ctr, u32 * pm07_cntrl) +static inline void enable_ctr(u32 cpu, u32 ctr, u32 *pm07_cntrl) { pm07_cntrl[ctr] |= CBE_PM_CTR_ENABLE; diff --git a/arch/sparc/oprofile/init.c b/arch/sparc/oprofile/init.c index 9ab815b95b5..17bb6035069 100644 --- a/arch/sparc/oprofile/init.c +++ b/arch/sparc/oprofile/init.c @@ -12,7 +12,7 @@ #include #include -int __init oprofile_arch_init(struct oprofile_operations * ops) +int __init oprofile_arch_init(struct oprofile_operations *ops) { return -ENODEV; } diff --git a/arch/sparc64/oprofile/init.c b/arch/sparc64/oprofile/init.c index 9ab815b95b5..17bb6035069 100644 --- a/arch/sparc64/oprofile/init.c +++ b/arch/sparc64/oprofile/init.c @@ -12,7 +12,7 @@ #include #include -int __init oprofile_arch_init(struct oprofile_operations * ops) +int __init oprofile_arch_init(struct oprofile_operations *ops) { return -ENODEV; } diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index e2095cba409..36e324139f7 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -53,7 +53,7 @@ struct frame_head { } __attribute__((packed)); static struct frame_head * -dump_user_backtrace(struct frame_head * head) +dump_user_backtrace(struct frame_head *head) { struct frame_head bufhead[2]; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b9a810b3326..a2e83afbe3e 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -69,7 +69,7 @@ static unsigned long reset_value[NUM_COUNTERS]; data. Should then be added to linux/oprofile.h. */ extern void oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int * const ibs_sample, int ibs_code); + unsigned int *const ibs_sample, int ibs_code); struct ibs_fetch_sample { /* MSRC001_1031 IBS Fetch Linear Address Register */ @@ -469,9 +469,9 @@ static void clear_ibs_nmi(void) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } -static int (*create_arch_files)(struct super_block * sb, struct dentry * root); +static int (*create_arch_files)(struct super_block *sb, struct dentry *root); -static int setup_ibs_files(struct super_block * sb, struct dentry * root) +static int setup_ibs_files(struct super_block *sb, struct dentry *root) { struct dentry *dir; int ret = 0; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 05a0261ba0c..24ccdebf3ac 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -22,8 +22,8 @@ struct op_msr { }; struct op_msrs { - struct op_msr * counters; - struct op_msr * controls; + struct op_msr *counters; + struct op_msr *controls; }; struct pt_regs; diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index d6c68270b3d..b47ce038490 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -112,7 +112,7 @@ void end_cpu_work(void) } /* Resets the cpu buffer to a sane state. */ -void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf) +void cpu_buffer_reset(struct oprofile_cpu_buffer *cpu_buf) { /* reset these to invalid values; the next sample * collected will populate the buffer with proper @@ -123,7 +123,7 @@ void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf) } /* compute number of available slots in cpu_buffer queue */ -static unsigned long nr_available_slots(struct oprofile_cpu_buffer const * b) +static unsigned long nr_available_slots(struct oprofile_cpu_buffer const *b) { unsigned long head = b->head_pos; unsigned long tail = b->tail_pos; @@ -134,7 +134,7 @@ static unsigned long nr_available_slots(struct oprofile_cpu_buffer const * b) return tail + (b->buffer_size - head) - 1; } -static void increment_head(struct oprofile_cpu_buffer * b) +static void increment_head(struct oprofile_cpu_buffer *b) { unsigned long new_head = b->head_pos + 1; @@ -149,17 +149,17 @@ static void increment_head(struct oprofile_cpu_buffer * b) } static inline void -add_sample(struct oprofile_cpu_buffer * cpu_buf, +add_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, unsigned long event) { - struct op_sample * entry = &cpu_buf->buffer[cpu_buf->head_pos]; + struct op_sample *entry = &cpu_buf->buffer[cpu_buf->head_pos]; entry->eip = pc; entry->event = event; increment_head(cpu_buf); } static inline void -add_code(struct oprofile_cpu_buffer * buffer, unsigned long value) +add_code(struct oprofile_cpu_buffer *buffer, unsigned long value) { add_sample(buffer, ESCAPE_CODE, value); } @@ -173,10 +173,10 @@ add_code(struct oprofile_cpu_buffer * buffer, unsigned long value) * pc. We tag this in the buffer by generating kernel enter/exit * events whenever is_kernel changes */ -static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc, +static int log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, int is_kernel, unsigned long event) { - struct task_struct * task; + struct task_struct *task; cpu_buf->sample_received++; @@ -222,7 +222,7 @@ static int oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) return 1; } -static void oprofile_end_trace(struct oprofile_cpu_buffer * cpu_buf) +static void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf) { cpu_buf->tracing = 0; } @@ -260,7 +260,7 @@ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) #define MAX_IBS_SAMPLE_SIZE 14 void oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int * const ibs_sample, int ibs_code) + unsigned int *const ibs_sample, int ibs_code) { int is_kernel = !user_mode(regs); struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); @@ -345,7 +345,7 @@ void oprofile_add_trace(unsigned long pc) */ static void wq_sync_buffer(struct work_struct *work) { - struct oprofile_cpu_buffer * b = + struct oprofile_cpu_buffer *b = container_of(work, struct oprofile_cpu_buffer, work.work); if (b->cpu != smp_processor_id()) { printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n", diff --git a/drivers/oprofile/cpu_buffer.h b/drivers/oprofile/cpu_buffer.h index 9c44d004da6..9bc6bb20b6d 100644 --- a/drivers/oprofile/cpu_buffer.h +++ b/drivers/oprofile/cpu_buffer.h @@ -36,10 +36,10 @@ struct oprofile_cpu_buffer { volatile unsigned long head_pos; volatile unsigned long tail_pos; unsigned long buffer_size; - struct task_struct * last_task; + struct task_struct *last_task; int last_is_kernel; int tracing; - struct op_sample * buffer; + struct op_sample *buffer; unsigned long sample_received; unsigned long sample_lost_overflow; unsigned long backtrace_aborted; @@ -50,7 +50,7 @@ struct oprofile_cpu_buffer { DECLARE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer); -void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf); +void cpu_buffer_reset(struct oprofile_cpu_buffer *cpu_buf); /* transient events for the CPU buffer -> event buffer */ #define CPU_IS_KERNEL 1 diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index 8d692a5c8e7..c9329f4e090 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -28,7 +28,7 @@ DEFINE_MUTEX(buffer_mutex); static unsigned long buffer_opened; static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); -static unsigned long * event_buffer; +static unsigned long *event_buffer; static unsigned long buffer_size; static unsigned long buffer_watershed; static size_t buffer_pos; @@ -98,7 +98,7 @@ void free_event_buffer(void) } -static int event_buffer_open(struct inode * inode, struct file * file) +static int event_buffer_open(struct inode *inode, struct file *file) { int err = -EPERM; @@ -134,7 +134,7 @@ out: } -static int event_buffer_release(struct inode * inode, struct file * file) +static int event_buffer_release(struct inode *inode, struct file *file) { oprofile_stop(); oprofile_shutdown(); @@ -146,8 +146,8 @@ static int event_buffer_release(struct inode * inode, struct file * file) } -static ssize_t event_buffer_read(struct file * file, char __user * buf, - size_t count, loff_t * offset) +static ssize_t event_buffer_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) { int retval = -EINVAL; size_t const max = buffer_size * sizeof(unsigned long); diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index 18323650806..7a44ddba0be 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -31,8 +31,8 @@ extern unsigned long backtrace_depth; struct super_block; struct dentry; -void oprofile_create_files(struct super_block * sb, struct dentry * root); -void oprofile_timer_init(struct oprofile_operations * ops); +void oprofile_create_files(struct super_block *sb, struct dentry *root); +void oprofile_timer_init(struct oprofile_operations *ops); int oprofile_set_backtrace(unsigned long depth); diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index ef953ba5ab6..241804abbb5 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -18,13 +18,13 @@ unsigned long fs_buffer_size = 131072; unsigned long fs_cpu_buffer_size = 8192; unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */ -static ssize_t depth_read(struct file * file, char __user * buf, size_t count, loff_t * offset) +static ssize_t depth_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { return oprofilefs_ulong_to_user(backtrace_depth, buf, count, offset); } -static ssize_t depth_write(struct file * file, char const __user * buf, size_t count, loff_t * offset) +static ssize_t depth_write(struct file *file, char const __user *buf, size_t count, loff_t *offset) { unsigned long val; int retval; @@ -50,7 +50,7 @@ static const struct file_operations depth_fops = { }; -static ssize_t pointer_size_read(struct file * file, char __user * buf, size_t count, loff_t * offset) +static ssize_t pointer_size_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { return oprofilefs_ulong_to_user(sizeof(void *), buf, count, offset); } @@ -61,7 +61,7 @@ static const struct file_operations pointer_size_fops = { }; -static ssize_t cpu_type_read(struct file * file, char __user * buf, size_t count, loff_t * offset) +static ssize_t cpu_type_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { return oprofilefs_str_to_user(oprofile_ops.cpu_type, buf, count, offset); } @@ -72,13 +72,13 @@ static const struct file_operations cpu_type_fops = { }; -static ssize_t enable_read(struct file * file, char __user * buf, size_t count, loff_t * offset) +static ssize_t enable_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { return oprofilefs_ulong_to_user(oprofile_started, buf, count, offset); } -static ssize_t enable_write(struct file * file, char const __user * buf, size_t count, loff_t * offset) +static ssize_t enable_write(struct file *file, char const __user *buf, size_t count, loff_t *offset) { unsigned long val; int retval; @@ -107,7 +107,7 @@ static const struct file_operations enable_fops = { }; -static ssize_t dump_write(struct file * file, char const __user * buf, size_t count, loff_t * offset) +static ssize_t dump_write(struct file *file, char const __user *buf, size_t count, loff_t *offset) { wake_up_buffer_waiter(); return count; @@ -118,7 +118,7 @@ static const struct file_operations dump_fops = { .write = dump_write, }; -void oprofile_create_files(struct super_block * sb, struct dentry * root) +void oprofile_create_files(struct super_block *sb, struct dentry *root) { oprofilefs_create_file(sb, root, "enable", &enable_fops); oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666); diff --git a/drivers/oprofile/oprofile_stats.c b/drivers/oprofile/oprofile_stats.c index f99b28e7b79..e0c45498d17 100644 --- a/drivers/oprofile/oprofile_stats.c +++ b/drivers/oprofile/oprofile_stats.c @@ -19,7 +19,7 @@ struct oprofile_stat_struct oprofile_stats; void oprofile_reset_stats(void) { - struct oprofile_cpu_buffer * cpu_buf; + struct oprofile_cpu_buffer *cpu_buf; int i; for_each_possible_cpu(i) { @@ -36,11 +36,11 @@ void oprofile_reset_stats(void) } -void oprofile_create_stats_files(struct super_block * sb, struct dentry * root) +void oprofile_create_stats_files(struct super_block *sb, struct dentry *root) { - struct oprofile_cpu_buffer * cpu_buf; - struct dentry * cpudir; - struct dentry * dir; + struct oprofile_cpu_buffer *cpu_buf; + struct dentry *cpudir; + struct dentry *dir; char buf[10]; int i; diff --git a/drivers/oprofile/oprofile_stats.h b/drivers/oprofile/oprofile_stats.h index 6d755a633f1..54e59c29b43 100644 --- a/drivers/oprofile/oprofile_stats.h +++ b/drivers/oprofile/oprofile_stats.h @@ -28,6 +28,6 @@ struct super_block; struct dentry; /* create the stats/ dir */ -void oprofile_create_stats_files(struct super_block * sb, struct dentry * root); +void oprofile_create_stats_files(struct super_block *sb, struct dentry *root); #endif /* OPROFILE_STATS_H */ diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 8543cb26cf3..a275a3aa5f0 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -23,9 +23,9 @@ DEFINE_SPINLOCK(oprofilefs_lock); -static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode) +static struct inode *oprofilefs_get_inode(struct super_block *sb, int mode) { - struct inode * inode = new_inode(sb); + struct inode *inode = new_inode(sb); if (inode) { inode->i_mode = mode; @@ -44,7 +44,7 @@ static struct super_operations s_ops = { }; -ssize_t oprofilefs_str_to_user(char const * str, char __user * buf, size_t count, loff_t * offset) +ssize_t oprofilefs_str_to_user(char const *str, char __user *buf, size_t count, loff_t *offset) { return simple_read_from_buffer(buf, count, offset, str, strlen(str)); } @@ -52,7 +52,7 @@ ssize_t oprofilefs_str_to_user(char const * str, char __user * buf, size_t count #define TMPBUFSIZE 50 -ssize_t oprofilefs_ulong_to_user(unsigned long val, char __user * buf, size_t count, loff_t * offset) +ssize_t oprofilefs_ulong_to_user(unsigned long val, char __user *buf, size_t count, loff_t *offset) { char tmpbuf[TMPBUFSIZE]; size_t maxlen = snprintf(tmpbuf, TMPBUFSIZE, "%lu\n", val); @@ -62,7 +62,7 @@ ssize_t oprofilefs_ulong_to_user(unsigned long val, char __user * buf, size_t co } -int oprofilefs_ulong_from_user(unsigned long * val, char const __user * buf, size_t count) +int oprofilefs_ulong_from_user(unsigned long *val, char const __user *buf, size_t count) { char tmpbuf[TMPBUFSIZE]; unsigned long flags; @@ -85,16 +85,16 @@ int oprofilefs_ulong_from_user(unsigned long * val, char const __user * buf, siz } -static ssize_t ulong_read_file(struct file * file, char __user * buf, size_t count, loff_t * offset) +static ssize_t ulong_read_file(struct file *file, char __user *buf, size_t count, loff_t *offset) { - unsigned long * val = file->private_data; + unsigned long *val = file->private_data; return oprofilefs_ulong_to_user(*val, buf, count, offset); } -static ssize_t ulong_write_file(struct file * file, char const __user * buf, size_t count, loff_t * offset) +static ssize_t ulong_write_file(struct file *file, char const __user *buf, size_t count, loff_t *offset) { - unsigned long * value = file->private_data; + unsigned long *value = file->private_data; int retval; if (*offset) @@ -108,7 +108,7 @@ static ssize_t ulong_write_file(struct file * file, char const __user * buf, siz } -static int default_open(struct inode * inode, struct file * filp) +static int default_open(struct inode *inode, struct file *filp) { if (inode->i_private) filp->private_data = inode->i_private; @@ -129,12 +129,12 @@ static const struct file_operations ulong_ro_fops = { }; -static struct dentry * __oprofilefs_create_file(struct super_block * sb, - struct dentry * root, char const * name, const struct file_operations * fops, +static struct dentry *__oprofilefs_create_file(struct super_block *sb, + struct dentry *root, char const *name, const struct file_operations *fops, int perm) { - struct dentry * dentry; - struct inode * inode; + struct dentry *dentry; + struct inode *inode; dentry = d_alloc_name(root, name); if (!dentry) @@ -150,10 +150,10 @@ static struct dentry * __oprofilefs_create_file(struct super_block * sb, } -int oprofilefs_create_ulong(struct super_block * sb, struct dentry * root, - char const * name, unsigned long * val) +int oprofilefs_create_ulong(struct super_block *sb, struct dentry *root, + char const *name, unsigned long *val) { - struct dentry * d = __oprofilefs_create_file(sb, root, name, + struct dentry *d = __oprofilefs_create_file(sb, root, name, &ulong_fops, 0644); if (!d) return -EFAULT; @@ -163,10 +163,10 @@ int oprofilefs_create_ulong(struct super_block * sb, struct dentry * root, } -int oprofilefs_create_ro_ulong(struct super_block * sb, struct dentry * root, - char const * name, unsigned long * val) +int oprofilefs_create_ro_ulong(struct super_block *sb, struct dentry *root, + char const *name, unsigned long *val) { - struct dentry * d = __oprofilefs_create_file(sb, root, name, + struct dentry *d = __oprofilefs_create_file(sb, root, name, &ulong_ro_fops, 0444); if (!d) return -EFAULT; @@ -176,9 +176,9 @@ int oprofilefs_create_ro_ulong(struct super_block * sb, struct dentry * root, } -static ssize_t atomic_read_file(struct file * file, char __user * buf, size_t count, loff_t * offset) +static ssize_t atomic_read_file(struct file *file, char __user *buf, size_t count, loff_t *offset) { - atomic_t * val = file->private_data; + atomic_t *val = file->private_data; return oprofilefs_ulong_to_user(atomic_read(val), buf, count, offset); } @@ -189,10 +189,10 @@ static const struct file_operations atomic_ro_fops = { }; -int oprofilefs_create_ro_atomic(struct super_block * sb, struct dentry * root, - char const * name, atomic_t * val) +int oprofilefs_create_ro_atomic(struct super_block *sb, struct dentry *root, + char const *name, atomic_t *val) { - struct dentry * d = __oprofilefs_create_file(sb, root, name, + struct dentry *d = __oprofilefs_create_file(sb, root, name, &atomic_ro_fops, 0444); if (!d) return -EFAULT; @@ -202,8 +202,8 @@ int oprofilefs_create_ro_atomic(struct super_block * sb, struct dentry * root, } -int oprofilefs_create_file(struct super_block * sb, struct dentry * root, - char const * name, const struct file_operations * fops) +int oprofilefs_create_file(struct super_block *sb, struct dentry *root, + char const *name, const struct file_operations *fops) { if (!__oprofilefs_create_file(sb, root, name, fops, 0644)) return -EFAULT; @@ -211,8 +211,8 @@ int oprofilefs_create_file(struct super_block * sb, struct dentry * root, } -int oprofilefs_create_file_perm(struct super_block * sb, struct dentry * root, - char const * name, const struct file_operations * fops, int perm) +int oprofilefs_create_file_perm(struct super_block *sb, struct dentry *root, + char const *name, const struct file_operations *fops, int perm) { if (!__oprofilefs_create_file(sb, root, name, fops, perm)) return -EFAULT; @@ -220,11 +220,11 @@ int oprofilefs_create_file_perm(struct super_block * sb, struct dentry * root, } -struct dentry * oprofilefs_mkdir(struct super_block * sb, - struct dentry * root, char const * name) +struct dentry *oprofilefs_mkdir(struct super_block *sb, + struct dentry *root, char const *name) { - struct dentry * dentry; - struct inode * inode; + struct dentry *dentry; + struct inode *inode; dentry = d_alloc_name(root, name); if (!dentry) @@ -241,10 +241,10 @@ struct dentry * oprofilefs_mkdir(struct super_block * sb, } -static int oprofilefs_fill_super(struct super_block * sb, void * data, int silent) +static int oprofilefs_fill_super(struct super_block *sb, void *data, int silent) { - struct inode * root_inode; - struct dentry * root_dentry; + struct inode *root_inode; + struct dentry *root_dentry; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; diff --git a/drivers/oprofile/timer_int.c b/drivers/oprofile/timer_int.c index 710a45f0d73..7258b141a51 100644 --- a/drivers/oprofile/timer_int.c +++ b/drivers/oprofile/timer_int.c @@ -35,7 +35,7 @@ static void timer_stop(void) } -void __init oprofile_timer_init(struct oprofile_operations * ops) +void __init oprofile_timer_init(struct oprofile_operations *ops) { ops->create_files = NULL; ops->setup = NULL; -- cgit v1.2.3-70-g09d2 From 69046d430417c30f48867fc52e892c9050b3a29b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 5 Sep 2008 12:17:40 +0200 Subject: x86/oprofile: reordering functions in nmi_int.c No functional changes. The intension is to remove static function declarations. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 147 +++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 57f6c908808..370d832f398 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -28,85 +28,9 @@ static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); -static int nmi_start(void); -static void nmi_stop(void); -static void nmi_cpu_start(void *dummy); -static void nmi_cpu_stop(void *dummy); - /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; -#ifdef CONFIG_SMP -static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, - void *data) -{ - int cpu = (unsigned long)data; - switch (action) { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block oprofile_cpu_nb = { - .notifier_call = oprofile_cpu_notifier -}; -#endif - -#ifdef CONFIG_PM - -static int nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* Only one CPU left, just stop that one */ - if (nmi_enabled == 1) - nmi_cpu_stop(NULL); - return 0; -} - -static int nmi_resume(struct sys_device *dev) -{ - if (nmi_enabled == 1) - nmi_cpu_start(NULL); - return 0; -} - -static struct sysdev_class oprofile_sysclass = { - .name = "oprofile", - .resume = nmi_resume, - .suspend = nmi_suspend, -}; - -static struct sys_device device_oprofile = { - .id = 0, - .cls = &oprofile_sysclass, -}; - -static int __init init_sysfs(void) -{ - int error; - - error = sysdev_class_register(&oprofile_sysclass); - if (!error) - error = sysdev_register(&device_oprofile); - return error; -} - -static void exit_sysfs(void) -{ - sysdev_unregister(&device_oprofile); - sysdev_class_unregister(&oprofile_sysclass); -} - -#else -#define init_sysfs() do { } while (0) -#define exit_sysfs() do { } while (0) -#endif /* CONFIG_PM */ - static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -361,6 +285,77 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) return 0; } +#ifdef CONFIG_SMP +static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, + void *data) +{ + int cpu = (unsigned long)data; + switch (action) { + case CPU_DOWN_FAILED: + case CPU_ONLINE: + smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block oprofile_cpu_nb = { + .notifier_call = oprofile_cpu_notifier +}; +#endif + +#ifdef CONFIG_PM + +static int nmi_suspend(struct sys_device *dev, pm_message_t state) +{ + /* Only one CPU left, just stop that one */ + if (nmi_enabled == 1) + nmi_cpu_stop(NULL); + return 0; +} + +static int nmi_resume(struct sys_device *dev) +{ + if (nmi_enabled == 1) + nmi_cpu_start(NULL); + return 0; +} + +static struct sysdev_class oprofile_sysclass = { + .name = "oprofile", + .resume = nmi_resume, + .suspend = nmi_suspend, +}; + +static struct sys_device device_oprofile = { + .id = 0, + .cls = &oprofile_sysclass, +}; + +static int __init init_sysfs(void) +{ + int error; + + error = sysdev_class_register(&oprofile_sysclass); + if (!error) + error = sysdev_register(&device_oprofile); + return error; +} + +static void exit_sysfs(void) +{ + sysdev_unregister(&device_oprofile); + sysdev_class_unregister(&oprofile_sysclass); +} + +#else +#define init_sysfs() do { } while (0) +#define exit_sysfs() do { } while (0) +#endif /* CONFIG_PM */ + static int p4force; module_param(p4force, int, 0); -- cgit v1.2.3-70-g09d2 From 5f87dfb79f829339508a5d989b8252eb30842587 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 15 Oct 2008 08:15:51 -0500 Subject: x86/oprofile: add the logic for enabling additional IBS bits This patch adds the logic for enabling additional IBS control bits : * IBS-Fetch IbsRandEn bit (bit 57) * IBS-Op IbsOpCntCtl bit (bit 19) Signed-off-by: Suravee Suthikulpanit Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index a2e83afbe3e..509513760a6 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -310,12 +310,15 @@ static void op_amd_start(struct op_msrs const * const msrs) #ifdef CONFIG_OPROFILE_IBS if (ibs_allowed && ibs_config.fetch_enabled) { low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = IBS_FETCH_HIGH_ENABLE; + high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ + + IBS_FETCH_HIGH_ENABLE; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } if (ibs_allowed && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ + + IBS_OP_LOW_ENABLE; high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } -- cgit v1.2.3-70-g09d2 From 1339c367a842a9fc34b8fcfa1abadb07b11848ad Mon Sep 17 00:00:00 2001 From: Pavel Vasilyev Date: Wed, 15 Oct 2008 17:33:48 -0400 Subject: fix CONFIG_MMCONFIG=n build warning arch/x86/kernel/acpi/boot.c:100: warning: 'acpi_mcfg_64bit_base_addr' defined but not used http://bugzilla.kernel.org/show_bug.cgi?id=11743 Signed-off-by: Pavel Vasilyev Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c102af85df9..0c2742f8c4d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -97,7 +97,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #warning ACPI uses CMPXCHG, i486 and later hardware #endif -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; /* -------------------------------------------------------------------------- Boot-time Configuration @@ -156,6 +155,9 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) } #ifdef CONFIG_PCI_MMCONFIG + +static int acpi_mcfg_64bit_base_addr __initdata = FALSE; + /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; -- cgit v1.2.3-70-g09d2 From fe648be019119722ec0ac54e3a4b2e5bf5168589 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:41 -0700 Subject: x86: add after_bootmem flag for 32bit to prepare to use dyn_array support etc. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8396868e82c..91343c2694b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -66,6 +66,7 @@ static unsigned long __meminitdata table_end; static unsigned long __meminitdata table_top; static int __initdata after_init_bootmem; +int after_bootmem; static __init void *alloc_low_page(unsigned long *phys) { @@ -988,6 +989,8 @@ void __init mem_init(void) set_highmem_pages_init(); + after_bootmem = 1; + codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; -- cgit v1.2.3-70-g09d2 From 1f3fcd4b1adc972d5c6a34cfed98931c46575b49 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:44 -0700 Subject: add per_cpu_dyn_array support allow dyn-array in per_cpu area, allocated dynamically. usage: | /* in .h */ | struct kernel_stat { | struct cpu_usage_stat cpustat; | unsigned int *irqs; | }; | | /* in .c */ | DEFINE_PER_CPU(struct kernel_stat, kstat); | | DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL); after setup_percpu()/per_cpu_alloc_dyn_array(), the dyn_array in per_cpu area is ready to use. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 7 +++-- include/asm-generic/vmlinux.lds.h | 6 ++++ include/linux/init.h | 27 +++++++++++++++-- init/main.c | 63 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 96 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0e67f72d931..13ba7a83808 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -140,7 +140,7 @@ static void __init setup_cpu_pda_map(void) */ void __init setup_per_cpu_areas(void) { - ssize_t size = PERCPU_ENOUGH_ROOM; + ssize_t size, old_size; char *ptr; int cpu; @@ -148,7 +148,8 @@ void __init setup_per_cpu_areas(void) setup_cpu_pda_map(); /* Copy section for each CPU (we discard the original) */ - size = PERCPU_ENOUGH_ROOM; + old_size = PERCPU_ENOUGH_ROOM; + size = old_size + per_cpu_dyn_array_size(); printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); @@ -176,6 +177,8 @@ void __init setup_per_cpu_areas(void) per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + per_cpu_alloc_dyn_array(cpu, ptr + old_size); + } printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7881406c03e..c68eda9d9a9 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -216,6 +216,12 @@ VMLINUX_SYMBOL(__dyn_array_start) = .; \ *(.dyn_array.init) \ VMLINUX_SYMBOL(__dyn_array_end) = .; \ + } \ + . = ALIGN((align)); \ + .per_cpu_dyn_array.init : AT(ADDR(.per_cpu_dyn_array.init) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__per_cpu_dyn_array_start) = .; \ + *(.per_cpu_dyn_array.init) \ + VMLINUX_SYMBOL(__per_cpu_dyn_array_end) = .; \ } #define SECURITY_INIT \ .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) { \ diff --git a/include/linux/init.h b/include/linux/init.h index cf9fa7f174a..332806826b8 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -255,12 +255,13 @@ struct dyn_array { void (*init_work)(void *); }; extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[]; +extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[]; -#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ +#define DEFINE_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ static struct dyn_array __dyn_array_##nameX __initdata = \ - { .name = (void **)&nameX,\ + { .name = (void **)&(nameX),\ .size = sizeX,\ - .nr = &nrX,\ + .nr = &(nrX),\ .align = alignX,\ .init_work = init_workX,\ }; \ @@ -268,7 +269,27 @@ extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[]; __attribute__((__section__(".dyn_array.init"))) = \ &__dyn_array_##nameX +#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ + DEFINE_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX) + +#define DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ + static struct dyn_array __per_cpu_dyn_array_##nameX __initdata = \ + { .name = (void **)&(addrX),\ + .size = sizeX,\ + .nr = &(nrX),\ + .align = alignX,\ + .init_work = init_workX,\ + }; \ + static struct dyn_array *__per_cpu_dyn_array_ptr_##nameX __used \ + __attribute__((__section__(".per_cpu_dyn_array.init"))) = \ + &__per_cpu_dyn_array_##nameX + +#define DEFINE_PER_CPU_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ + DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX) + extern void pre_alloc_dyn_array(void); +extern unsigned long per_cpu_dyn_array_size(void); +extern void per_cpu_alloc_dyn_array(int cpu, char *ptr); #endif /* __ASSEMBLY__ */ /** diff --git a/init/main.c b/init/main.c index 638d3a78641..416bca4f734 100644 --- a/init/main.c +++ b/init/main.c @@ -391,17 +391,19 @@ EXPORT_SYMBOL(__per_cpu_offset); static void __init setup_per_cpu_areas(void) { - unsigned long size, i; + unsigned long size, i, old_size; char *ptr; unsigned long nr_possible_cpus = num_possible_cpus(); /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + old_size = PERCPU_ENOUGH_ROOM; + size = ALIGN(old_size + per_cpu_dyn_array_size(), PAGE_SIZE); ptr = alloc_bootmem_pages(size * nr_possible_cpus); for_each_possible_cpu(i) { __per_cpu_offset[i] = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + per_cpu_alloc_dyn_array(i, ptr + old_size); ptr += size; } } @@ -559,6 +561,63 @@ void pre_alloc_dyn_array(void) #endif } +unsigned long per_cpu_dyn_array_size(void) +{ + unsigned long total_size = 0; +#ifdef CONFIG_HAVE_DYN_ARRAY + unsigned long size; + struct dyn_array **daa; + + for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) { + struct dyn_array *da = *daa; + + size = da->size * (*da->nr); + print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name); + printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n", + da->size, *da->nr, da->align); + total_size += roundup(size, da->align); + } + if (total_size) + printk(KERN_DEBUG "per_cpu_dyna_array total_size: %#lx\n", + total_size); +#endif + return total_size; +} + +void per_cpu_alloc_dyn_array(int cpu, char *ptr) +{ +#ifdef CONFIG_HAVE_DYN_ARRAY + unsigned long size, phys; + struct dyn_array **daa; + unsigned long addr; + void **array; + + phys = virt_to_phys(ptr); + + for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) { + struct dyn_array *da = *daa; + + size = da->size * (*da->nr); + print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name); + printk(KERN_CONT "size:%#lx nr:%d align:%#lx", + da->size, *da->nr, da->align); + + phys = roundup(phys, da->align); + addr = (unsigned long)da->name; + addr += per_cpu_offset(cpu); + array = (void **)addr; + *array = phys_to_virt(phys); + *da->name = *array; /* so init_work could use it directly */ + printk(KERN_CONT " %p ==> [%#lx - %#lx]\n", array, phys, phys + size); + phys += size; + + if (da->init_work) { + da->init_work(da); + } + } +#endif +} + asmlinkage void __init start_kernel(void) { char * command_line; -- cgit v1.2.3-70-g09d2 From 1f8ff037a871690c762d267d8a052529d3102fc9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:45 -0700 Subject: x86: alloc dyn_array all together so could spare some memory with small alignment in bootmem also tighten the alignment checking, and make print out less debug info. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 16 +++++++---- include/linux/init.h | 2 +- init/main.c | 65 ++++++++++++++++++++++++++++++++---------- 3 files changed, 62 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 13ba7a83808..2b7dab699e8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -140,26 +140,31 @@ static void __init setup_cpu_pda_map(void) */ void __init setup_per_cpu_areas(void) { - ssize_t size, old_size; + ssize_t size, old_size, da_size; char *ptr; int cpu; + unsigned long align = 1; /* Setup cpu_pda map */ setup_cpu_pda_map(); /* Copy section for each CPU (we discard the original) */ old_size = PERCPU_ENOUGH_ROOM; - size = old_size + per_cpu_dyn_array_size(); + da_size = per_cpu_dyn_array_size(&align); + align = max_t(unsigned long, PAGE_SIZE, align); + size = roundup(old_size + da_size, align); printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); + ptr = __alloc_bootmem(size, align, + __pa(MAX_DMA_ADDRESS)); #else int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); + ptr = __alloc_bootmem(size, align, + __pa(MAX_DMA_ADDRESS)); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", cpu, node); @@ -168,7 +173,8 @@ void __init setup_per_cpu_areas(void) cpu, __pa(ptr)); } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); + ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, + __pa(MAX_DMA_ADDRESS)); if (ptr) printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", cpu, node, __pa(ptr)); diff --git a/include/linux/init.h b/include/linux/init.h index 332806826b8..59fbb4aaba6 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -288,7 +288,7 @@ extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[] DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX) extern void pre_alloc_dyn_array(void); -extern unsigned long per_cpu_dyn_array_size(void); +extern unsigned long per_cpu_dyn_array_size(unsigned long *align); extern void per_cpu_alloc_dyn_array(int cpu, char *ptr); #endif /* __ASSEMBLY__ */ diff --git a/init/main.c b/init/main.c index 416bca4f734..ab97d0877ac 100644 --- a/init/main.c +++ b/init/main.c @@ -394,10 +394,14 @@ static void __init setup_per_cpu_areas(void) unsigned long size, i, old_size; char *ptr; unsigned long nr_possible_cpus = num_possible_cpus(); + unsigned long align = 1; + unsigned da_size; /* Copy section for each CPU (we discard the original) */ old_size = PERCPU_ENOUGH_ROOM; - size = ALIGN(old_size + per_cpu_dyn_array_size(), PAGE_SIZE); + da_size = per_cpu_dyn_array_size(&align); + align = max_t(unsigned long, PAGE_SIZE, align); + size = ALIGN(old_size + da_size, align); ptr = alloc_bootmem_pages(size * nr_possible_cpus); for_each_possible_cpu(i) { @@ -541,45 +545,78 @@ void __init __weak thread_info_cache_init(void) void pre_alloc_dyn_array(void) { #ifdef CONFIG_HAVE_DYN_ARRAY - unsigned long size, phys = 0; + unsigned long total_size = 0, size, phys; + unsigned long max_align = 1; struct dyn_array **daa; + char *ptr; + /* get the total size at first */ for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) { struct dyn_array *da = *daa; size = da->size * (*da->nr); - print_fn_descriptor_symbol("dyna_array %s ", da->name); - printk(KERN_CONT "size:%#lx nr:%d align:%#lx", + print_fn_descriptor_symbol("dyn_array %s ", da->name); + printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n", da->size, *da->nr, da->align); - *da->name = __alloc_bootmem(size, da->align, phys); - phys = virt_to_phys(*da->name); + total_size += roundup(size, da->align); + if (da->align > max_align) + max_align = da->align; + } + if (total_size) + printk(KERN_DEBUG "dyn_array total_size: %#lx\n", + total_size); + else + return; + + /* allocate them all together */ + max_align = max_t(unsigned long, max_align, PAGE_SIZE); + ptr = __alloc_bootmem_nopanic(total_size, max_align, 0); + if (!ptr) + panic("Can not alloc dyn_alloc\n"); + + phys = virt_to_phys(ptr); + for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) { + struct dyn_array *da = *daa; + + size = da->size * (*da->nr); + print_fn_descriptor_symbol("dyn_array %s ", da->name); + + phys = roundup(phys, da->align); + *da->name = phys_to_virt(phys); printk(KERN_CONT " ==> [%#lx - %#lx]\n", phys, phys + size); + phys += size; + if (da->init_work) da->init_work(da); } #endif } -unsigned long per_cpu_dyn_array_size(void) +unsigned long per_cpu_dyn_array_size(unsigned long *align) { unsigned long total_size = 0; #ifdef CONFIG_HAVE_DYN_ARRAY unsigned long size; struct dyn_array **daa; + unsigned max_align = 1; for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) { struct dyn_array *da = *daa; size = da->size * (*da->nr); - print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name); + print_fn_descriptor_symbol("per_cpu_dyn_array %s ", da->name); printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n", da->size, *da->nr, da->align); total_size += roundup(size, da->align); + if (da->align > max_align) + max_align = da->align; } - if (total_size) - printk(KERN_DEBUG "per_cpu_dyna_array total_size: %#lx\n", + if (total_size) { + printk(KERN_DEBUG "per_cpu_dyn_array total_size: %#lx\n", total_size); + *align = max_align; + } #endif return total_size; } @@ -593,14 +630,11 @@ void per_cpu_alloc_dyn_array(int cpu, char *ptr) void **array; phys = virt_to_phys(ptr); - for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) { struct dyn_array *da = *daa; size = da->size * (*da->nr); - print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name); - printk(KERN_CONT "size:%#lx nr:%d align:%#lx", - da->size, *da->nr, da->align); + print_fn_descriptor_symbol("per_cpu_dyn_array %s ", da->name); phys = roundup(phys, da->align); addr = (unsigned long)da->name; @@ -608,7 +642,8 @@ void per_cpu_alloc_dyn_array(int cpu, char *ptr) array = (void **)addr; *array = phys_to_virt(phys); *da->name = *array; /* so init_work could use it directly */ - printk(KERN_CONT " %p ==> [%#lx - %#lx]\n", array, phys, phys + size); + printk(KERN_CONT " ==> [%#lx - %#lx]\n", phys, phys + size); + phys += size; if (da->init_work) { -- cgit v1.2.3-70-g09d2 From 6da55c3e8da88e8a7cb6452160776ad6706798ad Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:46 -0700 Subject: x86: enable dyn_array support Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/Kconfig | 2 ++ arch/x86/Kconfig | 1 + arch/x86/kernel/vmlinux_32.lds.S | 1 + arch/x86/kernel/vmlinux_64.lds.S | 3 +++ 4 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 0267babe5eb..c1f9febb404 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -103,3 +103,5 @@ config HAVE_CLK The calls support software clock gating and thus are a key power management tool on many systems. +config HAVE_DYN_ARRAY + def_bool n diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f65c2744d57..42f98009d75 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,6 +33,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_DYN_ARRAY config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a9b8560adbc..c36007ab394 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -145,6 +145,7 @@ SECTIONS *(.x86_cpu_dev.init) __x86_cpu_dev_end = .; } + DYN_ARRAY_INIT(8) SECURITY_INIT . = ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 46e05447405..30973dbac8c 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -173,6 +173,9 @@ SECTIONS *(.x86_cpu_dev.init) } __x86_cpu_dev_end = .; + + DYN_ARRAY_INIT(8) + SECURITY_INIT . = ALIGN(8); -- cgit v1.2.3-70-g09d2 From 0799e432acfda879eaeef9622426bfa1434f3786 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:48 -0700 Subject: x86: use nr_irqs also add first_free_entry and pin_map_size, which were NR_IRQS derived constants. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 26 ++++++++++++++------------ arch/x86/kernel/io_apic_64.c | 33 +++++++++++++++++---------------- arch/x86/kernel/irq_32.c | 8 ++++---- arch/x86/kernel/irq_64.c | 8 ++++---- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 2 +- include/asm-x86/irq.h | 3 +++ 7 files changed, 44 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index e710289f673..d382990244f 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -70,6 +70,7 @@ int timer_through_8259 __initdata; */ int sis_apic_bug = -1; +int first_free_entry = NR_IRQS; /* * # of IRQ routing registers */ @@ -100,6 +101,8 @@ static int disable_timer_pin_1 __initdata; #define MAX_PLUS_SHARED_IRQS NR_IRQS #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +int pin_map_size = PIN_MAP_SIZE; + /* * This is performance-critical, we want to do it O(1) * @@ -213,7 +216,6 @@ static void ioapic_mask_entry(int apic, int pin) */ static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - static int first_free_entry = NR_IRQS; struct irq_pin_list *entry = irq_2_pin + irq; while (entry->next) @@ -222,7 +224,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) if (entry->pin != -1) { entry->next = first_free_entry; entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) + if (++first_free_entry >= pin_map_size) panic("io_apic.c: whoops"); } entry->apic = apic; @@ -457,7 +459,7 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) int i, j; for_each_online_cpu(i) { - for (j = 0; j < NR_IRQS; j++) { + for (j = 0; j < nr_irqs; j++) { if (!irq_desc[j].action) continue; /* Is it a significant load ? */ @@ -492,7 +494,7 @@ static void do_irq_balance(void) if (!cpu_online(i)) continue; package_index = CPU_TO_PACKAGEINDEX(i); - for (j = 0; j < NR_IRQS; j++) { + for (j = 0; j < nr_irqs; j++) { unsigned long value_now, delta; /* Is this an active IRQ or balancing disabled ? */ if (!irq_desc[j].action || irq_balancing_disabled(j)) @@ -587,7 +589,7 @@ tryanotherirq: */ move_this_load = 0; selected_irq = -1; - for (j = 0; j < NR_IRQS; j++) { + for (j = 0; j < nr_irqs; j++) { /* Is this an active IRQ? */ if (!irq_desc[j].action) continue; @@ -664,7 +666,7 @@ static int balanced_irq(void *unused) long time_remaining = balanced_irq_interval; /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < NR_IRQS ; i++) { + for (i = 0 ; i < nr_irqs ; i++) { irq_desc[i].pending_mask = cpumask_of_cpu(0); set_pending_irq(i, cpumask_of_cpu(0)); } @@ -712,8 +714,8 @@ static int __init balanced_irq_init(void) physical_balance = 1; for_each_online_cpu(i) { - irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); + irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { printk(KERN_ERR "balanced_irq_init: out of memory"); goto failed; @@ -1441,7 +1443,7 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { + for (i = 0; i < nr_irqs; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; @@ -1621,7 +1623,7 @@ static void __init enable_IO_APIC(void) int i, apic; unsigned long flags; - for (i = 0; i < PIN_MAP_SIZE; i++) { + for (i = 0; i < pin_map_size; i++) { irq_2_pin[i].pin = -1; irq_2_pin[i].next = 0; } @@ -2005,7 +2007,7 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < NR_IRQS ; irq++) { + for (irq = 0; irq < nr_irqs ; irq++) { if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { /* * Hmm.. We don't have an entry for this, @@ -2449,7 +2451,7 @@ int create_irq(void) irq = -ENOSPC; spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { + for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; if (irq_vector[new] != 0) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 02063ae042f..448384c7c1e 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -132,6 +132,7 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); #define MAX_PLUS_SHARED_IRQS NR_IRQS #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +int pin_map_size = PIN_MAP_SIZE; /* * This is performance-critical, we want to do it O(1) * @@ -224,7 +225,7 @@ static inline void io_apic_sync(unsigned int apic) int pin; \ struct irq_pin_list *entry = irq_2_pin + irq; \ \ - BUG_ON(irq >= NR_IRQS); \ + BUG_ON(irq >= nr_irqs); \ for (;;) { \ unsigned int reg; \ pin = entry->pin; \ @@ -301,7 +302,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) int apic, pin; struct irq_pin_list *entry = irq_2_pin + irq; - BUG_ON(irq >= NR_IRQS); + BUG_ON(irq >= nr_irqs); for (;;) { unsigned int reg; apic = entry->apic; @@ -358,19 +359,19 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ +int first_free_entry = NR_IRQS; static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - static int first_free_entry = NR_IRQS; struct irq_pin_list *entry = irq_2_pin + irq; - BUG_ON(irq >= NR_IRQS); + BUG_ON(irq >= nr_irqs); while (entry->next) entry = irq_2_pin + entry->next; if (entry->pin != -1) { entry->next = first_free_entry; entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) + if (++first_free_entry >= pin_map_size) panic("io_apic.c: ran out of irq_2_pin entries!"); } entry->apic = apic; @@ -634,7 +635,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) best_guess = irq; } } - BUG_ON(best_guess >= NR_IRQS); + BUG_ON(best_guess >= nr_irqs); return best_guess; } @@ -766,7 +767,7 @@ static int pin_2_irq(int idx, int apic, int pin) irq += nr_ioapic_registers[i++]; irq += pin; } - BUG_ON(irq >= NR_IRQS); + BUG_ON(irq >= nr_irqs); return irq; } @@ -801,7 +802,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) int cpu; struct irq_cfg *cfg; - BUG_ON((unsigned)irq >= NR_IRQS); + BUG_ON((unsigned)irq >= nr_irqs); cfg = &irq_cfg[irq]; /* Only try and allocate irqs on cpus that are present */ @@ -875,7 +876,7 @@ static void __clear_irq_vector(int irq) cpumask_t mask; int cpu, vector; - BUG_ON((unsigned)irq >= NR_IRQS); + BUG_ON((unsigned)irq >= nr_irqs); cfg = &irq_cfg[irq]; BUG_ON(!cfg->vector); @@ -895,7 +896,7 @@ void __setup_vector_irq(int cpu) int irq, vector; /* Mark the inuse vectors */ - for (irq = 0; irq < NR_IRQS; ++irq) { + for (irq = 0; irq < nr_irqs; ++irq) { if (!cpu_isset(cpu, irq_cfg[irq].domain)) continue; vector = irq_cfg[irq].vector; @@ -1193,7 +1194,7 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { + for (i = 0; i < nr_irqs; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; @@ -1366,7 +1367,7 @@ void __init enable_IO_APIC(void) int i, apic; unsigned long flags; - for (i = 0; i < PIN_MAP_SIZE; i++) { + for (i = 0; i < pin_map_size; i++) { irq_2_pin[i].pin = -1; irq_2_pin[i].next = 0; } @@ -1658,7 +1659,7 @@ static void ir_irq_migration(struct work_struct *work) { int irq; - for (irq = 0; irq < NR_IRQS; irq++) { + for (irq = 0; irq < nr_irqs; irq++) { struct irq_desc *desc = irq_desc + irq; if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -1707,7 +1708,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_desc *desc; struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; - if (irq >= NR_IRQS) + if (irq >= nr_irqs) continue; desc = irq_desc + irq; @@ -1865,7 +1866,7 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < NR_IRQS ; irq++) { + for (irq = 0; irq < nr_irqs ; irq++) { if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { /* * Hmm.. We don't have an entry for this, @@ -2279,7 +2280,7 @@ int create_irq(void) irq = -ENOSPC; spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { + for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; if (irq_cfg[new].vector != 0) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index b71e02d42f4..4c7ffb32854 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -226,7 +226,7 @@ unsigned int do_IRQ(struct pt_regs *regs) int overflow, irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; - if (unlikely((unsigned)irq >= NR_IRQS)) { + if (unlikely((unsigned)irq >= nr_irqs)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __func__, irq); BUG(); @@ -271,7 +271,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < NR_IRQS) { + if (i < nr_irqs) { unsigned any_count = 0; spin_lock_irqsave(&irq_desc[i].lock, flags); @@ -303,7 +303,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { + } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", nmi_count(j)); @@ -396,7 +396,7 @@ void fixup_irqs(cpumask_t map) unsigned int irq; static int warned; - for (irq = 0; irq < NR_IRQS; irq++) { + for (irq = 0; irq < nr_irqs; irq++) { cpumask_t mask; if (irq == 2) continue; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f065fe9071b..e1f0839430d 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -81,7 +81,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < NR_IRQS) { + if (i < nr_irqs) { unsigned any_count = 0; spin_lock_irqsave(&irq_desc[i].lock, flags); @@ -112,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { + } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); @@ -201,7 +201,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - if (likely(irq < NR_IRQS)) + if (likely(irq < nr_irqs)) generic_handle_irq(irq); else { if (!disable_apic) @@ -224,7 +224,7 @@ void fixup_irqs(cpumask_t map) unsigned int irq; static int warned; - for (irq = 0; irq < NR_IRQS; irq++) { + for (irq = 0; irq < nr_irqs; irq++) { cpumask_t mask; int break_affinity = 0; int set_affinity = 1; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 9200a1e2752..65c1c950770 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -100,7 +100,7 @@ void __init native_init_IRQ(void) */ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) + if (i >= nr_irqs) break; /* SYSCALL_VECTOR was reserved in trap_init. */ if (!test_bit(vector, used_vectors)) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 5b5be9d43c2..165c5d9b0d1 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -142,7 +142,7 @@ void __init init_ISA_irqs(void) init_bsp_APIC(); init_8259A(0); - for (i = 0; i < NR_IRQS; i++) { + for (i = 0; i < nr_irqs; i++) { irq_desc[i].status = IRQ_DISABLED; irq_desc[i].action = NULL; irq_desc[i].depth = 1; diff --git a/include/asm-x86/irq.h b/include/asm-x86/irq.h index 1e5f2909c1d..2a130c44f5d 100644 --- a/include/asm-x86/irq.h +++ b/include/asm-x86/irq.h @@ -10,6 +10,9 @@ #include #include +extern int pin_map_size; +extern int first_free_entry; + static inline int irq_canonicalize(int irq) { return ((irq == 2) ? 9 : irq); -- cgit v1.2.3-70-g09d2 From 301e619020dd67bde7e7e64bb9ffb7f30d26c979 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:02 -0700 Subject: x86: use dyn_array in io_apic_xx.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 54 +++++++++++++++++++++++++++++++++++--------- arch/x86/kernel/io_apic_64.c | 28 +++++++++++++++++------ arch/x86/kernel/setup.c | 6 +++++ 3 files changed, 70 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index d382990244f..7f2bcc3dad8 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -70,7 +70,7 @@ int timer_through_8259 __initdata; */ int sis_apic_bug = -1; -int first_free_entry = NR_IRQS; +int first_free_entry; /* * # of IRQ routing registers */ @@ -98,10 +98,7 @@ static int disable_timer_pin_1 __initdata; * Rough estimation of how many shared IRQs there are, can * be changed anytime. */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -int pin_map_size = PIN_MAP_SIZE; +int pin_map_size; /* * This is performance-critical, we want to do it O(1) @@ -112,7 +109,9 @@ int pin_map_size = PIN_MAP_SIZE; static struct irq_pin_list { int apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; +} *irq_2_pin; + +DEFINE_DYN_ARRAY(irq_2_pin, sizeof(struct irq_pin_list), pin_map_size, 16, NULL); struct io_apic { unsigned int index; @@ -403,9 +402,28 @@ static struct irq_cpu_info { #define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) -static cpumask_t balance_irq_affinity[NR_IRQS] = { - [0 ... NR_IRQS-1] = CPU_MASK_ALL -}; +static cpumask_t balance_irq_affinity_init __initdata = CPU_MASK_ALL; + +static cpumask_t *balance_irq_affinity; + + +static void __init irq_affinity_init_work(void *data) +{ + struct dyn_array *da = data; + + int i; + struct balance_irq_affinity *affinity; + + affinity = *da->name; + + for (i = 0; i < *da->nr; i++) + memcpy(&affinity[i], &balance_irq_affinity_init, + sizeof(struct balance_irq_affinity)); + +} + +DEFINE_DYN_ARRAY(balance_irq_affinity, sizeof(struct balance_irq_affinity), nr_irqs, PAGE_SIZE, irq_affinity_init_work); + void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) { @@ -1170,14 +1188,28 @@ static inline int IO_APIC_irq_trigger(int irq) } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; +static u8 irq_vector_init_first __initdata = FIRST_DEVICE_VECTOR; +static u8 *irq_vector; + +static void __init irq_vector_init_work(void *data) +{ + struct dyn_array *da = data; + + u8 *irq_vec; + + irq_vec = *da->name; + + irq_vec[0] = irq_vector_init_first; +} + +DEFINE_DYN_ARRAY(irq_vector, sizeof(u8), nr_irqs, PAGE_SIZE, irq_vector_init_work); static int __assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, current_offset; int vector, offset; - BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); + BUG_ON((unsigned)irq >= nr_irqs); if (irq_vector[irq] > 0) return irq_vector[irq]; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 448384c7c1e..93a3ffabfe6 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -66,7 +66,7 @@ struct irq_cfg { }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { +static struct irq_cfg irq_cfg_legacy[] __initdata = { [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, @@ -85,6 +85,17 @@ static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; +static struct irq_cfg *irq_cfg; + +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + + memcpy(*da->name, irq_cfg_legacy, sizeof(irq_cfg_legacy)); +} + +DEFINE_DYN_ARRAY(irq_cfg, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); + static int assign_irq_vector(int irq, cpumask_t mask); int first_system_vector = 0xfe; @@ -129,10 +140,9 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); * Rough estimation of how many shared IRQs there are, can * be changed anytime. */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) -int pin_map_size = PIN_MAP_SIZE; +int pin_map_size; + /* * This is performance-critical, we want to do it O(1) * @@ -141,8 +151,12 @@ int pin_map_size = PIN_MAP_SIZE; */ static struct irq_pin_list { - short apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; + short apic, pin; + int next; +} *irq_2_pin; + +DEFINE_DYN_ARRAY(irq_2_pin, sizeof(struct irq_pin_list), pin_map_size, sizeof(struct irq_pin_list), NULL); + struct io_apic { unsigned int index; @@ -359,7 +373,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -int first_free_entry = NR_IRQS; +int first_free_entry; static void add_pin_to_irq(unsigned int irq, int apic, int pin) { struct irq_pin_list *entry = irq_2_pin + irq; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2255782e8d4..558ec26b08e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1067,9 +1067,15 @@ void __init setup_arch(char **cmdline_p) #endif prefill_possible_map(); + #ifdef CONFIG_X86_64 + /* need to wait for nr_cpu_ids settle down */ + if (nr_irqs == NR_IRQS) + nr_irqs = 32 * nr_cpu_ids + 224; init_cpu_to_node(); #endif + pin_map_size = nr_irqs * 2; + first_free_entry = nr_irqs; init_apic_mappings(); ioapic_init_mappings(); -- cgit v1.2.3-70-g09d2 From a84488c213a8cfc29200344a6fb6357d48c8ed85 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 20:50:31 -0700 Subject: irq: sparse irqs, fix #3 fix non-APIC UP build: arch/x86/kernel/built-in.o: In function `setup_arch': : undefined reference to `pin_map_size' arch/x86/kernel/built-in.o: In function `setup_arch': : undefined reference to `first_free_entry' Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 558ec26b08e..02e3a669797 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1074,8 +1074,10 @@ void __init setup_arch(char **cmdline_p) nr_irqs = 32 * nr_cpu_ids + 224; init_cpu_to_node(); #endif +#ifdef CONFIG_X86_IO_APIC pin_map_size = nr_irqs * 2; first_free_entry = nr_irqs; +#endif init_apic_mappings(); ioapic_init_mappings(); -- cgit v1.2.3-70-g09d2 From 71f521bbaf375b685aeea20c6b0ed8600cd6edfe Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:03 -0700 Subject: x86, irq: get nr_irqs from madt Until now, NR_IRQS was derived from black magic defines that had to be "large enough" to both accomodate NR_CPUS and MAX_NR_IO_APICs. This resulted in a way too large irq_desc[] array on most x86 systems. Especially with larger CPU masks, the size of irq_desc can spiral out of control quickly. So be smarter about it and use precise allocation instead: determine the default maximum possible IRQ number from the ACPI MADT. Use a minimum limit of at least 32 IRQs for broken BIOSes. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 30 ++++++++++++++++++++++++++++-- include/asm-x86/mpspec.h | 1 + 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index eb875cdc736..3e9d163fd92 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -957,6 +957,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } +int get_nr_irqs_via_madt(void) +{ + int idx; + int nr = 0; + + for (idx = 0; idx < nr_ioapics; idx++) { + if (mp_ioapic_routing[idx].gsi_end > nr) + nr = mp_ioapic_routing[idx].gsi_end; + } + + nr++; + + /* double it for hotplug and msi and nmi */ + nr <<= 1; + + /* something wrong ? */ + if (nr < 32) + nr = 32; + + return nr; + +} + static void assign_to_mp_irq(struct mp_config_intsrc *m, struct mp_config_intsrc *mp_irq) { @@ -1254,9 +1277,12 @@ static int __init acpi_parse_madt_ioapic_entries(void) return count; } + + nr_irqs = get_nr_irqs_via_madt(); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, - NR_IRQ_VECTORS); + nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); @@ -1276,7 +1302,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, - NR_IRQ_VECTORS); + nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index be2241a818f..a0748021250 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -60,6 +60,7 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); +extern int get_nr_irqs_via_madt(void); #ifdef CONFIG_X86_IO_APIC extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, u32 gsi, int triggering, int polarity); -- cgit v1.2.3-70-g09d2 From 08678b0841267c1d00d771fe01548d86043d065e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:05 -0700 Subject: generic: sparse irqs: use irq_desc() together with dyn_array, instead of irq_desc[] add CONFIG_HAVE_SPARSE_IRQ to for use condensed array. Get rid of irq_desc[] array assumptions. Preallocate 32 irq_desc, and irq_desc() will try to get more. ( No change in functionality is expected anywhere, except the odd build failure where we missed a code site or where a crossing commit itroduces new irq_desc[] usage. ) v2: according to Eric, change get_irq_desc() to irq_desc() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/Kconfig | 4 ++ arch/x86/Kconfig | 1 + arch/x86/kernel/io_apic_32.c | 46 ++++++++---- arch/x86/kernel/io_apic_64.c | 75 +++++++++++++------- arch/x86/kernel/irq_32.c | 24 ++++--- arch/x86/kernel/irq_64.c | 35 ++++----- arch/x86/kernel/irqinit_64.c | 10 +-- arch/x86/kernel/visws_quirks.c | 30 ++++---- arch/x86/mach-voyager/voyager_smp.c | 4 +- drivers/gpio/gpiolib.c | 2 +- drivers/mfd/asic3.c | 4 +- drivers/mfd/htc-egpio.c | 2 +- drivers/parisc/dino.c | 6 +- drivers/parisc/eisa.c | 4 +- drivers/parisc/gsc.c | 12 ++-- drivers/parisc/iosapic.c | 4 +- drivers/parisc/superio.c | 4 +- drivers/pcmcia/hd64465_ss.c | 12 +++- drivers/xen/events.c | 8 ++- include/linux/irq.h | 32 ++++++--- kernel/irq/autoprobe.c | 10 +-- kernel/irq/chip.c | 32 +++++---- kernel/irq/handle.c | 138 ++++++++++++++++++++++++++++++++---- kernel/irq/manage.c | 35 +++++---- kernel/irq/migration.c | 14 ++-- kernel/irq/proc.c | 36 +++++----- kernel/irq/resend.c | 2 +- kernel/irq/spurious.c | 5 +- 28 files changed, 404 insertions(+), 187 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index c1f9febb404..b3676224626 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -105,3 +105,7 @@ config HAVE_CLK config HAVE_DYN_ARRAY def_bool n + +config HAVE_SPARSE_IRQ + def_bool n + diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 42f98009d75..1004888e9b1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_DYN_ARRAY + select HAVE_SPARSE_IRQ if X86_64 config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 7f2bcc3dad8..c2160cfdec9 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -345,6 +345,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) struct irq_pin_list *entry = irq_2_pin + irq; unsigned int apicid_value; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, cpumask, cpu_online_map); if (cpus_empty(tmp)) @@ -365,7 +366,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) break; entry = irq_2_pin + entry->next; } - irq_desc[irq].affinity = cpumask; + desc = irq_to_desc(irq); + desc->affinity = cpumask; spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -475,10 +477,12 @@ static inline void balance_irq(int cpu, int irq) static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) { int i, j; + struct irq_desc *desc; for_each_online_cpu(i) { for (j = 0; j < nr_irqs; j++) { - if (!irq_desc[j].action) + desc = irq_to_desc(j); + if (!desc->action) continue; /* Is it a significant load ? */ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < @@ -505,6 +509,7 @@ static void do_irq_balance(void) unsigned long tmp_cpu_irq; unsigned long imbalance = 0; cpumask_t allowed_mask, target_cpu_mask, tmp; + struct irq_desc *desc; for_each_possible_cpu(i) { int package_index; @@ -515,7 +520,8 @@ static void do_irq_balance(void) for (j = 0; j < nr_irqs; j++) { unsigned long value_now, delta; /* Is this an active IRQ or balancing disabled ? */ - if (!irq_desc[j].action || irq_balancing_disabled(j)) + desc = irq_to_desc(j); + if (!desc->action || irq_balancing_disabled(j)) continue; if (package_index == i) IRQ_DELTA(package_index, j) = 0; @@ -609,7 +615,8 @@ tryanotherirq: selected_irq = -1; for (j = 0; j < nr_irqs; j++) { /* Is this an active IRQ? */ - if (!irq_desc[j].action) + desc = irq_to_desc(j); + if (!desc->action) continue; if (imbalance <= IRQ_DELTA(max_loaded, j)) continue; @@ -682,10 +689,12 @@ static int balanced_irq(void *unused) int i; unsigned long prev_balance_time = jiffies; long time_remaining = balanced_irq_interval; + struct irq_desc *desc; /* push everything to CPU 0 to give us a starting point. */ for (i = 0 ; i < nr_irqs ; i++) { - irq_desc[i].pending_mask = cpumask_of_cpu(0); + desc = irq_to_desc(i); + desc->pending_mask = cpumask_of_cpu(0); set_pending_irq(i, cpumask_of_cpu(0)); } @@ -1254,13 +1263,16 @@ static struct irq_chip ioapic_chip; static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { + struct irq_desc *desc; + + desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) { - irq_desc[irq].status |= IRQ_LEVEL; + desc->status |= IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); } else { - irq_desc[irq].status &= ~IRQ_LEVEL; + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); } @@ -2027,6 +2039,7 @@ static struct irq_chip ioapic_chip __read_mostly = { static inline void init_IO_APIC_traps(void) { int irq; + struct irq_desc *desc; /* * NOTE! The local APIC isn't very good at handling @@ -2048,9 +2061,11 @@ static inline void init_IO_APIC_traps(void) */ if (irq < 16) make_8259A_irq(irq); - else + else { + desc = irq_to_desc(irq); /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; + desc->chip = &no_irq_chip; + } } } } @@ -2089,7 +2104,10 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq, int vector) { - irq_desc[irq].status &= ~IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); set_intr_gate(vector, interrupt[irq]); @@ -2556,6 +2574,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp; int vector; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2575,7 +2594,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2649,6 +2669,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2659,7 +2680,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(mask); target_ht_irq(irq, dest); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 93a3ffabfe6..cab5a25d81b 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -345,6 +345,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) unsigned long flags; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -361,9 +362,10 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) */ dest = SET_APIC_LOGICAL_ID(dest); + desc = irq_to_desc(irq); spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc->affinity = mask; spin_unlock_irqrestore(&ioapic_lock, flags); } #endif @@ -933,14 +935,17 @@ static struct irq_chip ir_ioapic_chip; static void ioapic_register_intr(int irq, unsigned long trigger) { + struct irq_desc *desc; + + desc = irq_to_desc(irq); if (trigger) - irq_desc[irq].status |= IRQ_LEVEL; + desc->status |= IRQ_LEVEL; else - irq_desc[irq].status &= ~IRQ_LEVEL; + desc->status &= ~IRQ_LEVEL; #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { - irq_desc[irq].status |= IRQ_MOVE_PCNTXT; + desc->status |= IRQ_MOVE_PCNTXT; if (trigger) set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, handle_fasteoi_irq, @@ -1596,10 +1601,10 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); static void migrate_ioapic_irq(int irq, cpumask_t mask) { struct irq_cfg *cfg = irq_cfg + irq; - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; cpumask_t tmp, cleanup_mask; struct irte irte; - int modify_ioapic_rte = desc->status & IRQ_LEVEL; + int modify_ioapic_rte; unsigned int dest; unsigned long flags; @@ -1616,6 +1621,8 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); @@ -1637,12 +1644,13 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) cfg->move_in_progress = 0; } - irq_desc[irq].affinity = mask; + desc->affinity = mask; } static int migrate_irq_remapped_level(int irq) { int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); mask_IO_APIC_irq(irq); @@ -1658,11 +1666,11 @@ static int migrate_irq_remapped_level(int irq) } /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, irq_desc[irq].pending_mask); + migrate_ioapic_irq(irq, desc->pending_mask); ret = 0; - irq_desc[irq].status &= ~IRQ_MOVE_PENDING; - cpus_clear(irq_desc[irq].pending_mask); + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); unmask: unmask_IO_APIC_irq(irq); @@ -1674,7 +1682,7 @@ static void ir_irq_migration(struct work_struct *work) int irq; for (irq = 0; irq < nr_irqs; irq++) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -1686,8 +1694,7 @@ static void ir_irq_migration(struct work_struct *work) continue; } - desc->chip->set_affinity(irq, - irq_desc[irq].pending_mask); + desc->chip->set_affinity(irq, desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -1698,9 +1705,11 @@ static void ir_irq_migration(struct work_struct *work) */ static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - if (irq_desc[irq].status & IRQ_LEVEL) { - irq_desc[irq].status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; migrate_irq_remapped_level(irq); return; } @@ -1725,7 +1734,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) if (irq >= nr_irqs) continue; - desc = irq_desc + irq; + desc = irq_to_desc(irq); cfg = irq_cfg + irq; spin_lock(&desc->lock); if (!cfg->move_cleanup_count) @@ -1791,7 +1800,7 @@ static void ack_apic_level(unsigned int irq) irq_complete_move(irq); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; mask_IO_APIC_irq(irq); } @@ -1868,6 +1877,7 @@ static struct irq_chip ir_ioapic_chip __read_mostly = { static inline void init_IO_APIC_traps(void) { int irq; + struct irq_desc *desc; /* * NOTE! The local APIC isn't very good at handling @@ -1889,9 +1899,11 @@ static inline void init_IO_APIC_traps(void) */ if (irq < 16) make_8259A_irq(irq); - else + else { + desc = irq_to_desc(irq); /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; + desc->chip = &no_irq_chip; + } } } } @@ -1926,7 +1938,10 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { - irq_desc[irq].status &= ~IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -2402,6 +2417,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2421,7 +2437,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #ifdef CONFIG_INTR_REMAP @@ -2435,6 +2452,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp, cleanup_mask; struct irte irte; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2469,7 +2487,8 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) cfg->move_in_progress = 0; } - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif #endif /* CONFIG_SMP */ @@ -2543,7 +2562,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); /* * irq migration in process context */ @@ -2655,6 +2674,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2674,7 +2694,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2731,6 +2752,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) struct irq_cfg *cfg = irq_cfg + irq; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2743,7 +2765,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 4c7ffb32854..ede513be517 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -224,7 +224,7 @@ unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs; /* high bit used in ret_from_ code */ int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (unlikely((unsigned)irq >= nr_irqs)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -273,15 +273,16 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; + struct irq_desc *desc = irq_to_desc(i); - spin_lock_irqsave(&irq_desc[i].lock, flags); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); #else for_each_online_cpu(j) any_count |= kstat_cpu(j).irqs[i]; #endif - action = irq_desc[i].action; + action = desc->action; if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); @@ -291,8 +292,8 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); if (action) { seq_printf(p, " %s", action->name); @@ -302,7 +303,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) @@ -398,17 +399,20 @@ void fixup_irqs(cpumask_t map) for (irq = 0; irq < nr_irqs; irq++) { cpumask_t mask; + struct irq_desc *desc; + if (irq == 2) continue; - cpus_and(mask, irq_desc[irq].affinity, map); + desc = irq_to_desc(irq); + cpus_and(mask, desc->affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); mask = map; } - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); - else if (irq_desc[irq].action && !(warned++)) + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, mask); + else if (desc->action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index e1f0839430d..738eb65a924 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -83,15 +83,16 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; + struct irq_desc *desc = irq_to_desc(i); - spin_lock_irqsave(&irq_desc[i].lock, flags); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); #else for_each_online_cpu(j) any_count |= kstat_cpu(j).irqs[i]; #endif - action = irq_desc[i].action; + action = desc->action; if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); @@ -101,8 +102,8 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); if (action) { seq_printf(p, " %s", action->name); @@ -111,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) @@ -228,37 +229,39 @@ void fixup_irqs(cpumask_t map) cpumask_t mask; int break_affinity = 0; int set_affinity = 1; + struct irq_desc *desc; if (irq == 2) continue; + desc = irq_to_desc(irq); /* interrupt's are disabled at this point */ - spin_lock(&irq_desc[irq].lock); + spin_lock(&desc->lock); if (!irq_has_action(irq) || - cpus_equal(irq_desc[irq].affinity, map)) { - spin_unlock(&irq_desc[irq].lock); + cpus_equal(desc->affinity, map)) { + spin_unlock(&desc->lock); continue; } - cpus_and(mask, irq_desc[irq].affinity, map); + cpus_and(mask, desc->affinity, map); if (cpus_empty(mask)) { break_affinity = 1; mask = map; } - if (irq_desc[irq].chip->mask) - irq_desc[irq].chip->mask(irq); + if (desc->chip->mask) + desc->chip->mask(irq); - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, mask); else if (!(warned++)) set_affinity = 0; - if (irq_desc[irq].chip->unmask) - irq_desc[irq].chip->unmask(irq); + if (desc->chip->unmask) + desc->chip->unmask(irq); - spin_unlock(&irq_desc[irq].lock); + spin_unlock(&desc->lock); if (break_affinity && set_affinity) printk("Broke affinity for irq %i\n", irq); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 165c5d9b0d1..0744b49b4d1 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -143,9 +143,11 @@ void __init init_ISA_irqs(void) init_8259A(0); for (i = 0; i < nr_irqs; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; if (i < 16) { /* @@ -157,7 +159,7 @@ void __init init_ISA_irqs(void) /* * 'high' PCI IRQs filled in on demand */ - irq_desc[i].chip = &no_irq_chip; + desc->chip = &no_irq_chip; } } } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 61a97e616f7..9d85ab38443 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -484,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq) static unsigned int startup_cobalt_irq(unsigned int irq) { unsigned long flags; + struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) - irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); + if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) + desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); return 0; @@ -506,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq) static void end_cobalt_irq(unsigned int irq) { unsigned long flags; + struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) + if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS))) enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); } @@ -626,7 +628,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) spin_unlock_irqrestore(&i8259A_lock, flags); - desc = irq_desc + realirq; + desc = irq_to_desc(realirq); /* * handle this 'virtual interrupt' as a Cobalt one now. @@ -662,27 +664,29 @@ void init_VISWS_APIC_irqs(void) int i; for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = 0; - irq_desc[i].depth = 1; + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = 0; + desc->depth = 1; if (i == 0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE1) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_8259) { - irq_desc[i].chip = &piix4_master_irq_type; + desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { - irq_desc[i].chip = &piix4_virtual_irq_type; + desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 199a5f4a873..0f6e8a6523a 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1483,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq) * the interrupt off to another CPU */ static void before_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); __u8 cpu = smp_processor_id(); _raw_spin_lock(&vic_irq_lock); @@ -1518,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq) /* Finish the VIC interrupt: basically mask */ static void after_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); _raw_spin_lock(&vic_irq_lock); { diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 8d2940517c9..572d372899d 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1058,7 +1058,7 @@ static void gpiolib_dbg_show(struct seq_file *s, struct gpio_chip *chip) if (!is_out) { int irq = gpio_to_irq(gpio); - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); /* This races with request_irq(), set_irq_type(), * and set_irq_wake() ... but those are "rare". diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index ba5aa200827..e4c0db4dc7b 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -123,7 +123,7 @@ static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc) irqnr = asic->irq_base + (ASIC3_GPIOS_PER_BANK * bank) + i; - desc = irq_desc + irqnr; + desc = irq_to_desc(irqnr); desc->handle_irq(irqnr, desc); if (asic->irq_bothedge[bank] & bit) asic3_irq_flip_edge(asic, base, @@ -136,7 +136,7 @@ static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc) for (i = ASIC3_NUM_GPIOS; i < ASIC3_NR_IRQS; i++) { /* They start at bit 4 and go up */ if (status & (1 << (i - ASIC3_NUM_GPIOS + 4))) { - desc = irq_desc + asic->irq_base + i; + desc = irq_to_desc(asic->irq_base + i); desc->handle_irq(asic->irq_base + i, desc); } diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c index 6be43172dc6..ad3379fcd19 100644 --- a/drivers/mfd/htc-egpio.c +++ b/drivers/mfd/htc-egpio.c @@ -112,7 +112,7 @@ static void egpio_handler(unsigned int irq, struct irq_desc *desc) /* Run irq handler */ pr_debug("got IRQ %d\n", irqpin); irq = ei->irq_start + irqpin; - desc = &irq_desc[irq]; + desc = irq_to_desc(irq); desc->handle_irq(irq, desc); } } diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index fd56128525d..3bc54b30c3a 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -298,7 +298,8 @@ struct pci_port_ops dino_port_ops = { static void dino_disable_irq(unsigned int irq) { - struct dino_device *dino_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct dino_device *dino_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS); DBG(KERN_WARNING "%s(0x%p, %d)\n", __func__, dino_dev, irq); @@ -310,7 +311,8 @@ static void dino_disable_irq(unsigned int irq) static void dino_enable_irq(unsigned int irq) { - struct dino_device *dino_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct dino_device *dino_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS); u32 tmp; diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c index 771cef59254..7891db50c48 100644 --- a/drivers/parisc/eisa.c +++ b/drivers/parisc/eisa.c @@ -346,10 +346,10 @@ static int __init eisa_probe(struct parisc_device *dev) } /* Reserve IRQ2 */ - irq_desc[2].action = &irq2_action; + irq_to_desc(2)->action = &irq2_action; for (i = 0; i < 16; i++) { - irq_desc[i].chip = &eisa_interrupt_type; + irq_to_desc(i)->chip = &eisa_interrupt_type; } EISA_bus = 1; diff --git a/drivers/parisc/gsc.c b/drivers/parisc/gsc.c index f7d088b897e..e76db9e4d50 100644 --- a/drivers/parisc/gsc.c +++ b/drivers/parisc/gsc.c @@ -108,7 +108,8 @@ int gsc_find_local_irq(unsigned int irq, int *global_irqs, int limit) static void gsc_asic_disable_irq(unsigned int irq) { - struct gsc_asic *irq_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct gsc_asic *irq_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32); u32 imr; @@ -123,7 +124,8 @@ static void gsc_asic_disable_irq(unsigned int irq) static void gsc_asic_enable_irq(unsigned int irq) { - struct gsc_asic *irq_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct gsc_asic *irq_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32); u32 imr; @@ -159,12 +161,14 @@ static struct hw_interrupt_type gsc_asic_interrupt_type = { int gsc_assign_irq(struct hw_interrupt_type *type, void *data) { static int irq = GSC_IRQ_BASE; + struct irq_desc *desc; if (irq > GSC_IRQ_MAX) return NO_IRQ; - irq_desc[irq].chip = type; - irq_desc[irq].chip_data = data; + desc = irq_to_desc(irq); + desc->chip = type; + desc->chip_data = data; return irq++; } diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 6fb3f7979f2..7beffcab274 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -619,7 +619,9 @@ iosapic_set_irt_data( struct vector_info *vi, u32 *dp0, u32 *dp1) static struct vector_info *iosapic_get_vector(unsigned int irq) { - return irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + + return desc->chip_data; } static void iosapic_disable_irq(unsigned int irq) diff --git a/drivers/parisc/superio.c b/drivers/parisc/superio.c index 1e8d2d17f04..1e93c837514 100644 --- a/drivers/parisc/superio.c +++ b/drivers/parisc/superio.c @@ -363,7 +363,9 @@ int superio_fixup_irq(struct pci_dev *pcidev) #endif for (i = 0; i < 16; i++) { - irq_desc[i].chip = &superio_interrupt_type; + struct irq_desc *desc = irq_to_desc(i); + + desc->chip = &superio_interrupt_type; } /* diff --git a/drivers/pcmcia/hd64465_ss.c b/drivers/pcmcia/hd64465_ss.c index 117dc12ab43..9ef69cdb318 100644 --- a/drivers/pcmcia/hd64465_ss.c +++ b/drivers/pcmcia/hd64465_ss.c @@ -233,15 +233,18 @@ static struct hw_interrupt_type hd64465_ss_irq_type = { */ static void hs_map_irq(hs_socket_t *sp, unsigned int irq) { + struct irq_desc *desc; + DPRINTK("hs_map_irq(sock=%d irq=%d)\n", sp->number, irq); if (irq >= HS_NUM_MAPPED_IRQS) return; + desc = irq_to_desc(irq); hs_mapped_irq[irq].sock = sp; /* insert ourselves as the irq controller */ - hs_mapped_irq[irq].old_handler = irq_desc[irq].chip; - irq_desc[irq].chip = &hd64465_ss_irq_type; + hs_mapped_irq[irq].old_handler = desc->chip; + desc->chip = &hd64465_ss_irq_type; } @@ -250,13 +253,16 @@ static void hs_map_irq(hs_socket_t *sp, unsigned int irq) */ static void hs_unmap_irq(hs_socket_t *sp, unsigned int irq) { + struct irq_desc *desc; + DPRINTK("hs_unmap_irq(sock=%d irq=%d)\n", sp->number, irq); if (irq >= HS_NUM_MAPPED_IRQS) return; + desc = irq_to_desc(irq); /* restore the original irq controller */ - irq_desc[irq].chip = hs_mapped_irq[irq].old_handler; + desc->chip = hs_mapped_irq[irq].old_handler; } /*============================================================*/ diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ed8235187dc..56ace47f24d 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -125,7 +125,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) BUG_ON(irq == -1); #ifdef CONFIG_SMP - irq_desc[irq].affinity = cpumask_of_cpu(cpu); + irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); #endif __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); @@ -139,8 +139,10 @@ static void init_evtchn_cpu_bindings(void) #ifdef CONFIG_SMP int i; /* By default all event channels notify CPU#0. */ - for (i = 0; i < nr_irqs; i++) - irq_desc[i].affinity = cpumask_of_cpu(0); + for (i = 0; i < nr_irqs; i++) { + struct irq_desc *desc = irq_to_desc(i); + desc->affinity = cpumask_of_cpu(0); + } #endif memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); diff --git a/include/linux/irq.h b/include/linux/irq.h index 5f4b013624d..80b8200f2ad 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -152,6 +152,10 @@ struct irq_chip { * @name: flow handler name for /proc/interrupts output */ struct irq_desc { + unsigned int irq; +#ifdef CONFIG_HAVE_SPARSE_IRQ + struct irq_desc *next; +#endif irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -179,9 +183,9 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; -#ifdef CONFIG_HAVE_DYN_ARRAY -extern struct irq_desc *irq_desc; -#else +extern struct irq_desc *irq_to_desc(unsigned int irq); +#ifndef CONFIG_HAVE_DYN_ARRAY +/* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; #endif @@ -249,7 +253,10 @@ extern int no_irq_affinity; static inline int irq_balancing_disabled(unsigned int irq) { - return irq_desc[irq].status & IRQ_NO_BALANCING_MASK; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + return desc->status & IRQ_NO_BALANCING_MASK; } /* Handle irq action chains: */ @@ -281,7 +288,7 @@ extern unsigned int __do_IRQ(unsigned int irq); */ static inline void generic_handle_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ desc->handle_irq(irq, desc); @@ -325,7 +332,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, static inline void __set_irq_handler_unlocked(int irq, irq_flow_handler_t handler) { - irq_desc[irq].handle_irq = handler; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->handle_irq = handler; } /* @@ -359,7 +369,7 @@ extern void destroy_irq(unsigned int irq); /* Test to see if a driver has successfully requested an irq */ static inline int irq_has_action(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); return desc->action != NULL; } @@ -374,10 +384,10 @@ extern int set_irq_chip_data(unsigned int irq, void *data); extern int set_irq_type(unsigned int irq, unsigned int type); extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); -#define get_irq_chip(irq) (irq_desc[irq].chip) -#define get_irq_chip_data(irq) (irq_desc[irq].chip_data) -#define get_irq_data(irq) (irq_desc[irq].handler_data) -#define get_irq_msi(irq) (irq_desc[irq].msi_desc) +#define get_irq_chip(irq) (irq_to_desc(irq)->chip) +#define get_irq_chip_data(irq) (irq_to_desc(irq)->chip_data) +#define get_irq_data(irq) (irq_to_desc(irq)->handler_data) +#define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) #endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index c689e9851a8..c45ab718cf0 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -39,7 +39,7 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for (i = nr_irqs-1; i > 0; i--) { - desc = irq_desc + i; + desc = irq_to_desc(i); spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { @@ -69,7 +69,7 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for (i = nr_irqs-1; i > 0; i--) { - desc = irq_desc + i; + desc = irq_to_desc(i); spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { @@ -92,7 +92,7 @@ unsigned long probe_irq_on(void) for (i = 0; i < nr_irqs; i++) { unsigned int status; - desc = irq_desc + i; + desc = irq_to_desc(i); spin_lock_irq(&desc->lock); status = desc->status; @@ -131,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val) mask = 0; for (i = 0; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc = irq_to_desc(i); unsigned int status; spin_lock_irq(&desc->lock); @@ -174,7 +174,7 @@ int probe_irq_off(unsigned long val) int i, irq_found = 0, nr_irqs = 0; for (i = 0; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc = irq_to_desc(i); unsigned int status; spin_lock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index bba66e09870..76c225cf4b2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -33,7 +33,7 @@ void dynamic_irq_init(unsigned int irq) } /* Ensure we don't have left over values from a previous use of this irq */ - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; @@ -65,7 +65,7 @@ void dynamic_irq_cleanup(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); @@ -100,7 +100,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; @@ -126,7 +126,7 @@ int set_irq_type(unsigned int irq, unsigned int type) return -ENODEV; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); if (type == IRQ_TYPE_NONE) return 0; @@ -155,7 +155,7 @@ int set_irq_data(unsigned int irq, void *data) return -EINVAL; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; spin_unlock_irqrestore(&desc->lock, flags); @@ -180,7 +180,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) "Trying to install msi data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) @@ -198,9 +198,10 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) */ int set_irq_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; unsigned long flags; + desc = irq_to_desc(irq); if (irq >= nr_irqs || !desc->chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; @@ -219,8 +220,9 @@ EXPORT_SYMBOL(set_irq_chip_data); */ static void default_enable(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; + desc = irq_to_desc(irq); desc->chip->unmask(irq); desc->status &= ~IRQ_MASKED; } @@ -237,7 +239,10 @@ static void default_disable(unsigned int irq) */ static unsigned int default_startup(unsigned int irq) { - irq_desc[irq].chip->enable(irq); + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->chip->enable(irq); return 0; } @@ -247,8 +252,9 @@ static unsigned int default_startup(unsigned int irq) */ static void default_shutdown(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; + desc = irq_to_desc(irq); desc->chip->mask(irq); desc->status |= IRQ_MASKED; } @@ -551,7 +557,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); if (!handle) handle = handle_bad_irq; @@ -616,7 +622,7 @@ void __init set_irq_noprobe(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; @@ -634,7 +640,7 @@ void __init set_irq_probe(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6ce3bcc2b8f..9fc33b3378e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,6 +18,14 @@ #include "internals.h" +#ifdef CONFIG_TRACE_IRQFLAGS + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; +#endif + /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -51,7 +59,8 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_HAVE_DYN_ARRAY -static struct irq_desc irq_desc_init __initdata = { +static struct irq_desc irq_desc_init = { + .irq = -1U, .status = IRQ_DISABLED, .chip = &no_irq_chip, .handle_irq = handle_bad_irq, @@ -62,6 +71,27 @@ static struct irq_desc irq_desc_init __initdata = { #endif }; + +static void init_one_irq_desc(struct irq_desc *desc) +{ + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); +#ifdef CONFIG_TRACE_IRQFLAGS + lockdep_set_class(&desc->lock, &irq_desc_lock_class); +#endif +} + +#ifdef CONFIG_HAVE_SPARSE_IRQ +static int nr_irq_desc = 32; + +static int __init parse_nr_irq_desc(char *arg) +{ + if (arg) + nr_irq_desc = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("nr_irq_desc", parse_nr_irq_desc); + static void __init init_work(void *data) { struct dyn_array *da = data; @@ -71,12 +101,83 @@ static void __init init_work(void *data) desc = *da->name; for (i = 0; i < *da->nr; i++) - memcpy(&desc[i], &irq_desc_init, sizeof(struct irq_desc)); + init_one_irq_desc(&desc[i]); + + for (i = 1; i < *da->nr; i++) + desc[i-1].next = &desc[i]; } -struct irq_desc *irq_desc; +static struct irq_desc *sparse_irqs; +DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); + +extern int after_bootmem; +extern void *__alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); +struct irq_desc *irq_to_desc(unsigned int irq) +{ + struct irq_desc *desc, *desc_pri; + int i; + int count = 0; + + BUG_ON(irq == -1U); + + desc_pri = desc = &sparse_irqs[0]; + while (desc) { + if (desc->irq == irq) + return desc; + + if (desc->irq == -1U) { + desc->irq = irq; + return desc; + } + desc_pri = desc; + desc = desc->next; + count++; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); + + if (after_bootmem) + desc = kzalloc(sizeof(struct irq_desc)*nr_irq_desc, GFP_ATOMIC); + else + desc = __alloc_bootmem_nopanic(sizeof(struct irq_desc)*nr_irq_desc, PAGE_SIZE, 0); + + if (!desc) + panic("please boot with nr_irq_desc= %d\n", count * 2); + + for (i = 0; i < nr_irq_desc; i++) + init_one_irq_desc(&desc[i]); + + for (i = 1; i < nr_irq_desc; i++) + desc[i-1].next = &desc[i]; + + desc->irq = irq; + desc_pri->next = desc; + + return desc; +} +#else +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + int i; + struct irq_desc *desc; + + desc = *da->name; + + for (i = 0; i < *da->nr; i++) + init_one_irq_desc(&desc[i]); + +} +static struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); +#endif + #else struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { @@ -85,12 +186,23 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = __SPIN_LOCK_UNLOCKED(sparse_irqs->lock), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif } }; + +#endif + +#ifndef CONFIG_HAVE_SPARSE_IRQ +struct irq_desc *irq_to_desc(unsigned int irq) +{ + if (irq < nr_irqs) + return &irq_desc[irq]; + + return NULL; +} #endif /* @@ -99,7 +211,10 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { */ static void ack_bad(unsigned int irq) { - print_irq_desc(irq, irq_desc + irq); + struct irq_desc *desc; + + desc = irq_to_desc(irq); + print_irq_desc(irq, desc); ack_bad_irq(irq); } @@ -196,7 +311,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) */ unsigned int __do_IRQ(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned int status; @@ -287,19 +402,16 @@ out: } #endif -#ifdef CONFIG_TRACE_IRQFLAGS - -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; +#ifdef CONFIG_TRACE_IRQFLAGS void early_init_irq_lock_class(void) { +#ifndef CONFIG_HAVE_DYN_ARRAY int i; for (i = 0; i < nr_irqs; i++) lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); +#endif } - #endif + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d5a4333d8f1..b5943e9f95a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,7 +31,7 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL; */ void synchronize_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned int status; if (irq >= nr_irqs) @@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq); */ int irq_can_set_affinity(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || !desc->chip->set_affinity) @@ -81,7 +81,7 @@ int irq_can_set_affinity(unsigned int irq) */ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (!desc->chip->set_affinity) return -EINVAL; @@ -111,14 +111,16 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) int irq_select_affinity(unsigned int irq) { cpumask_t mask; + struct irq_desc *desc; if (!irq_can_set_affinity(irq)) return 0; cpus_and(mask, cpu_online_map, irq_default_affinity); - irq_desc[irq].affinity = mask; - irq_desc[irq].chip->set_affinity(irq, mask); + desc = irq_to_desc(irq); + desc->affinity = mask; + desc->chip->set_affinity(irq, mask); set_balance_irq_affinity(irq, mask); return 0; @@ -140,7 +142,7 @@ int irq_select_affinity(unsigned int irq) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; if (irq >= nr_irqs) @@ -169,7 +171,7 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (irq >= nr_irqs) return; @@ -211,7 +213,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; if (irq >= nr_irqs) @@ -225,7 +227,7 @@ EXPORT_SYMBOL(enable_irq); static int set_irq_wake_real(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (desc->chip->set_wake) @@ -248,7 +250,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) */ int set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret = 0; @@ -288,12 +290,13 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - if (irq >= nr_irqs || irq_desc[irq].status & IRQ_NOREQUEST) + if (irq >= nr_irqs || desc->status & IRQ_NOREQUEST) return 0; - action = irq_desc[irq].action; + action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; @@ -349,7 +352,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, */ int setup_irq(unsigned int irq, struct irqaction *new) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *old, **p; const char *old_name = NULL; unsigned long flags; @@ -518,7 +521,7 @@ void free_irq(unsigned int irq, void *dev_id) if (irq >= nr_irqs) return; - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); p = &desc->action; for (;;) { @@ -615,6 +618,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, { struct irqaction *action; int retval; + struct irq_desc *desc; #ifdef CONFIG_LOCKDEP /* @@ -632,7 +636,8 @@ int request_irq(unsigned int irq, irq_handler_t handler, return -EINVAL; if (irq >= nr_irqs) return -EINVAL; - if (irq_desc[irq].status & IRQ_NOREQUEST) + desc = irq_to_desc(irq); + if (desc->status & IRQ_NOREQUEST) return -EINVAL; if (!handler) return -EINVAL; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 77b7acc875c..90b920d3f52 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -3,18 +3,18 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; + desc->pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } void move_masked_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) @@ -30,7 +30,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) + if (unlikely(cpus_empty(desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -38,7 +38,7 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); + cpus_and(tmp, desc->pending_mask, cpu_online_map); /* * If there was a valid mask to work with, please @@ -55,12 +55,12 @@ void move_masked_irq(int irq) if (likely(!cpus_empty(tmp))) { desc->chip->set_affinity(irq,tmp); } - cpus_clear(irq_desc[irq].pending_mask); + cpus_clear(desc->pending_mask); } void move_native_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e5225a65a4f..c2f356c808f 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_desc + (long)m->private; + struct irq_desc *desc = irq_to_desc((long)m->private); cpumask_t *mask = &desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -43,7 +43,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_t new_value; int err; - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; @@ -132,20 +132,20 @@ static const struct file_operations default_affinity_proc_fops = { static int irq_spurious_read(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct irq_desc *d = &irq_desc[(long) data]; + struct irq_desc *desc = irq_to_desc((long) data); return sprintf(page, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", - d->irq_count, - d->irqs_unhandled, - jiffies_to_msecs(d->last_unhandled)); + desc->irq_count, + desc->irqs_unhandled, + jiffies_to_msecs(desc->last_unhandled)); } #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; int ret = 1; @@ -165,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) void register_handler_proc(unsigned int irq, struct irqaction *action) { char name [MAX_NAMELEN]; + struct irq_desc *desc = irq_to_desc(irq); - if (!irq_desc[irq].dir || action->dir || !action->name || + if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; @@ -174,7 +175,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_desc[irq].dir); + action->dir = proc_mkdir(name, desc->dir); } #undef MAX_NAMELEN @@ -185,25 +186,24 @@ void register_irq_proc(unsigned int irq) { char name [MAX_NAMELEN]; struct proc_dir_entry *entry; + struct irq_desc *desc = irq_to_desc(irq); - if (!root_irq_dir || - (irq_desc[irq].chip == &no_irq_chip) || - irq_desc[irq].dir) + if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ - irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); + desc->dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ - proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, + proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); #endif - entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); + entry = create_proc_entry("spurious", 0444, desc->dir); if (entry) { entry->data = (void *)(long)irq; entry->read_proc = irq_spurious_read; @@ -214,8 +214,10 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { - if (action->dir) - remove_proc_entry(action->dir->name, irq_desc[irq].dir); + if (action->dir) { + struct irq_desc *desc = irq_to_desc(irq); + remove_proc_entry(action->dir->name, desc->dir); + } } void register_default_affinity_proc(void) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index cba8aa5bc7f..89c7117acf2 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -36,7 +36,7 @@ static void resend_irqs(unsigned long arg) while (!bitmap_empty(irqs_resend, nr_irqs)) { irq = find_first_bit(irqs_resend, nr_irqs); clear_bit(irq, irqs_resend); - desc = irq_desc + irq; + desc = irq_to_desc(irq); local_irq_disable(); desc->handle_irq(irq, desc); local_irq_enable(); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index e26ca1e90c0..b5d906002e1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -92,11 +92,12 @@ static int misrouted_irq(int irq) int ok = 0; for (i = 1; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc; if (i == irq) /* Already tried */ continue; + desc = irq_to_desc(i); if (try_one_irq(i, desc)) ok = 1; } @@ -108,7 +109,7 @@ static void poll_spurious_irqs(unsigned long dummy) { int i; for (i = 1; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc = irq_to_desc(i); unsigned int status; /* Racy but it doesn't matter */ -- cgit v1.2.3-70-g09d2 From 3ac2de48ed3c998df7f366e039c97eedb27e7c3d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:06 -0700 Subject: x86: add irq_cfg in io_apic_64.c preallocate size is 32, and if it is not enough, irq_cfg will more via alloc_bootmem() or kzalloc(). (depending on how early we are in system setup) v2: fix typo about size of init_one_irq_cfg ... should use sizeof(struct irq_cfg) v3: according to Eric, change get_irq_cfg() to irq_cfg() v4: squash add irq_cfg_alloc in Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 209 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 169 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index cab5a25d81b..858c37a31a2 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -57,7 +57,11 @@ #define __apicdebuginit(type) static type __init +struct irq_cfg; + struct irq_cfg { + unsigned int irq; + struct irq_cfg *next; cpumask_t domain; cpumask_t old_domain; unsigned move_cleanup_count; @@ -67,34 +71,132 @@ struct irq_cfg { /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, + [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; -static struct irq_cfg *irq_cfg; +static struct irq_cfg irq_cfg_init = { .irq = -1U, }; +/* need to be biger than size of irq_cfg_legacy */ +static int nr_irq_cfg = 32; + +static int __init parse_nr_irq_cfg(char *arg) +{ + if (arg) { + nr_irq_cfg = simple_strtoul(arg, NULL, 0); + if (nr_irq_cfg < 32) + nr_irq_cfg = 32; + } + return 0; +} + +early_param("nr_irq_cfg", parse_nr_irq_cfg); + +static void init_one_irq_cfg(struct irq_cfg *cfg) +{ + memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); +} static void __init init_work(void *data) { struct dyn_array *da = data; + struct irq_cfg *cfg; + int i; - memcpy(*da->name, irq_cfg_legacy, sizeof(irq_cfg_legacy)); + cfg = *da->name; + + memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); + + i = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + for (; i < *da->nr; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < *da->nr; i++) + cfg[i-1].next = &cfg[i]; } -DEFINE_DYN_ARRAY(irq_cfg, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); +static struct irq_cfg *irq_cfgx; +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + struct irq_cfg *cfg; + + BUG_ON(irq == -1U); + + cfg = &irq_cfgx[0]; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + if (cfg->irq == -1U) + return NULL; + + cfg = cfg->next; + } + + return NULL; +} + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + struct irq_cfg *cfg, *cfg_pri; + int i; + int count = 0; + + BUG_ON(irq == -1U); + + cfg_pri = cfg = &irq_cfgx[0]; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + if (cfg->irq == -1U) { + cfg->irq = irq; + return cfg; + } + cfg_pri = cfg; + cfg = cfg->next; + count++; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); + + if (after_bootmem) + cfg = kzalloc(sizeof(struct irq_cfg)*nr_irq_cfg, GFP_ATOMIC); + else + cfg = __alloc_bootmem_nopanic(sizeof(struct irq_cfg)*nr_irq_cfg, PAGE_SIZE, 0); + + if (!cfg) + panic("please boot with nr_irq_cfg= %d\n", count * 2); + + for (i = 0; i < nr_irq_cfg; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < nr_irq_cfg; i++) + cfg[i-1].next = &cfg[i]; + + cfg->irq = irq; + cfg_pri->next = cfg; + + return cfg; +} static int assign_irq_vector(int irq, cpumask_t mask); @@ -341,7 +443,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; unsigned int dest; cpumask_t tmp; @@ -381,6 +483,8 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) struct irq_pin_list *entry = irq_2_pin + irq; BUG_ON(irq >= nr_irqs); + irq_cfg_alloc(irq); + while (entry->next) entry = irq_2_pin + entry->next; @@ -819,7 +923,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) struct irq_cfg *cfg; BUG_ON((unsigned)irq >= nr_irqs); - cfg = &irq_cfg[irq]; + cfg = irq_cfg(irq); /* Only try and allocate irqs on cpus that are present */ cpus_and(mask, mask, cpu_online_map); @@ -893,7 +997,7 @@ static void __clear_irq_vector(int irq) int cpu, vector; BUG_ON((unsigned)irq >= nr_irqs); - cfg = &irq_cfg[irq]; + cfg = irq_cfg(irq); BUG_ON(!cfg->vector); vector = cfg->vector; @@ -913,17 +1017,23 @@ void __setup_vector_irq(int cpu) /* Mark the inuse vectors */ for (irq = 0; irq < nr_irqs; ++irq) { - if (!cpu_isset(cpu, irq_cfg[irq].domain)) + struct irq_cfg *cfg = irq_cfg(irq); + + if (!cpu_isset(cpu, cfg->domain)) continue; - vector = irq_cfg[irq].vector; + vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { + struct irq_cfg *cfg; + irq = per_cpu(vector_irq, cpu)[vector]; if (irq < 0) continue; - if (!cpu_isset(cpu, irq_cfg[irq].domain)) + + cfg = irq_cfg(irq); + if (!cpu_isset(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@ -1029,13 +1139,15 @@ static int setup_ioapic_entry(int apic, int irq, static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, int trigger, int polarity) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct IO_APIC_route_entry entry; cpumask_t mask; if (!IO_APIC_IRQ(irq)) return; + cfg = irq_cfg(irq); + mask = TARGET_CPUS; if (assign_irq_vector(irq, mask)) return; @@ -1553,7 +1665,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) static int ioapic_retrigger_irq(unsigned int irq) { - struct irq_cfg *cfg = &irq_cfg[irq]; + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; spin_lock_irqsave(&vector_lock, flags); @@ -1600,7 +1712,7 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); */ static void migrate_ioapic_irq(int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct irq_desc *desc; cpumask_t tmp, cleanup_mask; struct irte irte; @@ -1618,6 +1730,7 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -1735,7 +1848,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; desc = irq_to_desc(irq); - cfg = irq_cfg + irq; + cfg = irq_cfg(irq); spin_lock(&desc->lock); if (!cfg->move_cleanup_count) goto unlock; @@ -1754,7 +1867,7 @@ unlock: static void irq_complete_move(unsigned int irq) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg = irq_cfg(irq); unsigned vector, me; if (likely(!cfg->move_in_progress)) @@ -1891,7 +2004,10 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for (irq = 0; irq < nr_irqs ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { + struct irq_cfg *cfg; + + cfg = irq_cfg(irq); + if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -2028,7 +2144,7 @@ static inline void __init unlock_ExtINT_logic(void) */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg + 0; + struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -2306,14 +2422,19 @@ int create_irq(void) int irq; int new; unsigned long flags; + struct irq_cfg *cfg_new; irq = -ENOSPC; spin_lock_irqsave(&vector_lock, flags); for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; - if (irq_cfg[new].vector != 0) + cfg_new = irq_cfg(new); + if (cfg_new && cfg_new->vector != 0) continue; + /* check if need to create one */ + if (!cfg_new) + cfg_new = irq_cfg_alloc(new); if (__assign_irq_vector(new, TARGET_CPUS) == 0) irq = new; break; @@ -2346,7 +2467,7 @@ void destroy_irq(unsigned int irq) #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; int err; unsigned dest; cpumask_t tmp; @@ -2356,6 +2477,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms if (err) return err; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); @@ -2413,7 +2535,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms #ifdef CONFIG_SMP static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; @@ -2426,6 +2548,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2448,7 +2571,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) */ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp, cleanup_mask; struct irte irte; @@ -2464,6 +2587,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2670,7 +2794,7 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_SMP static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; @@ -2683,6 +2807,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2749,7 +2874,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp; struct irq_desc *desc; @@ -2761,6 +2886,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2783,7 +2909,7 @@ static struct irq_chip ht_irq_chip = { int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; int err; cpumask_t tmp; @@ -2793,6 +2919,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) struct ht_irq_msg msg; unsigned dest; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); @@ -2891,6 +3018,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; + struct irq_cfg *cfg; if (skip_ioapic_setup == 1) return; @@ -2906,7 +3034,8 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. */ - if (!irq_cfg[irq].vector) + cfg = irq_cfg(irq); + if (!cfg->vector) setup_IO_APIC_irq(ioapic, pin, irq, irq_trigger(irq_entry), irq_polarity(irq_entry)); -- cgit v1.2.3-70-g09d2 From e5a53714acfc7b5f868d07d27c5f02cb00b118db Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:07 -0700 Subject: x86: put irq_2_pin pointer into irq_cfg preallocate 32 irq_2_pin entries, and use get_one_free_irq_2_pin() to get one more and link to irq_cfg if needed. so don't waste one where no irq is enabled. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 161 ++++++++++++++++++++++++++++++++----------- 1 file changed, 120 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 858c37a31a2..51ef7eb75f2 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -58,10 +58,11 @@ #define __apicdebuginit(type) static type __init struct irq_cfg; - +struct irq_pin_list; struct irq_cfg { unsigned int irq; struct irq_cfg *next; + struct irq_pin_list *irq_2_pin; cpumask_t domain; cpumask_t old_domain; unsigned move_cleanup_count; @@ -252,13 +253,66 @@ int pin_map_size; * between pins and IRQs. */ -static struct irq_pin_list { - short apic, pin; - int next; -} *irq_2_pin; +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *irq_2_pin_head; +/* fill one page ? */ +static int nr_irq_2_pin = 0x100; +static struct irq_pin_list *irq_2_pin_ptr; +static void __init irq_2_pin_init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_pin_list *pin; + int i; + + pin = *da->name; + + for (i = 1; i < *da->nr; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = &pin[0]; +} +DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); + +static struct irq_pin_list *get_one_free_irq_2_pin(void) +{ + struct irq_pin_list *pin; + int i; + + pin = irq_2_pin_ptr; + + if (pin) { + irq_2_pin_ptr = pin->next; + pin->next = NULL; + return pin; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); + + if (after_bootmem) + pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, + GFP_ATOMIC); + else + pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * + nr_irq_2_pin, PAGE_SIZE, 0); + + if (!pin) + panic("can not get more irq_2_pin\n"); -DEFINE_DYN_ARRAY(irq_2_pin, sizeof(struct irq_pin_list), pin_map_size, sizeof(struct irq_pin_list), NULL); + for (i = 1; i < nr_irq_2_pin; i++) + pin[i-1].next = &pin[i]; + irq_2_pin_ptr = pin->next; + pin->next = NULL; + + return pin; +} struct io_apic { unsigned int index; @@ -300,16 +354,17 @@ static bool io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; unsigned long flags; + struct irq_cfg *cfg = irq_cfg(irq); spin_lock_irqsave(&ioapic_lock, flags); - entry = irq_2_pin + irq; + entry = cfg->irq_2_pin; for (;;) { unsigned int reg; int pin; - pin = entry->pin; - if (pin == -1) + if (!entry) break; + pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ if (reg & IO_APIC_REDIR_REMOTE_IRR) { @@ -318,7 +373,7 @@ static bool io_apic_level_ack_pending(unsigned int irq) } if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -339,21 +394,24 @@ static inline void io_apic_sync(unsigned int apic) \ { \ int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ + struct irq_cfg *cfg; \ + struct irq_pin_list *entry; \ \ BUG_ON(irq >= nr_irqs); \ + cfg = irq_cfg(irq); \ + entry = cfg->irq_2_pin; \ for (;;) { \ unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ + if (!entry) \ break; \ + pin = entry->pin; \ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ io_apic_modify(entry->apic, reg); \ FINAL; \ if (!entry->next) \ break; \ - entry = irq_2_pin + entry->next; \ + entry = entry->next; \ } \ } @@ -416,15 +474,20 @@ static void ioapic_mask_entry(int apic, int pin) static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) { int apic, pin; - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; BUG_ON(irq >= nr_irqs); + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; for (;;) { unsigned int reg; + + if (!entry) + break; + apic = entry->apic; pin = entry->pin; - if (pin == -1) - break; /* * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. @@ -437,7 +500,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) io_apic_modify(apic, reg); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } } @@ -480,22 +543,35 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) int first_free_entry; static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; BUG_ON(irq >= nr_irqs); - irq_cfg_alloc(irq); + /* first time to refer irq_cfg, so with new */ + cfg = irq_cfg_alloc(irq); + entry = cfg->irq_2_pin; + if (!entry) { + entry = get_one_free_irq_2_pin(); + cfg->irq_2_pin = entry; + entry->apic = apic; + entry->pin = pin; + printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); + return; + } - while (entry->next) - entry = irq_2_pin + entry->next; + while (entry->next) { + /* not again, please */ + if (entry->apic == apic && entry->pin == pin) + return; - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= pin_map_size) - panic("io_apic.c: ran out of irq_2_pin entries!"); + entry = entry->next; } + + entry->next = get_one_free_irq_2_pin(); + entry = entry->next; entry->apic = apic; entry->pin = pin; + printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); } /* @@ -505,17 +581,24 @@ static void __init replace_pin_at_irq(unsigned int irq, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg = irq_cfg(irq); + struct irq_pin_list *entry = cfg->irq_2_pin; + int replaced = 0; - while (1) { + while (entry) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - } - if (!entry->next) + replaced = 1; + /* every one is different, right? */ break; - entry = irq_2_pin + entry->next; + } + entry = entry->next; } + + /* why? call replace before add? */ + if (!replaced) + add_pin_to_irq(irq, newapic, newpin); } @@ -1326,15 +1409,16 @@ __apicdebuginit(void) print_IO_APIC(void) } printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < nr_irqs; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) + struct irq_cfg *cfg = irq_cfg(i); + struct irq_pin_list *entry = cfg->irq_2_pin; + if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } printk("\n"); } @@ -1495,14 +1579,9 @@ void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; - int i, apic; + int apic; unsigned long flags; - for (i = 0; i < pin_map_size; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - /* * The number of IO-APIC IRQ registers (== #pins): */ -- cgit v1.2.3-70-g09d2 From 7f95ec9e4c12fd067febfd57532da1166d75d858 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:09 -0700 Subject: x86: move kstat_irqs from kstat to irq_desc based on Eric's patch ... together mold it with dyn_array for irq_desc, will allcate kstat_irqs for nr_irq_desc alltogether if needed. -- at that point nr_cpus is known already. v2: make sure system without generic_hardirqs works they don't have irq_desc v3: fix merging v4: [mingo@elte.hu] fix typo [ mingo@elte.hu ] irq: build fix fix: arch/x86/xen/spinlock.c: In function 'xen_spin_lock_slow': arch/x86/xen/spinlock.c:90: error: 'struct kernel_stat' has no member named 'irqs' Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/irq_32.c | 4 +- arch/x86/kernel/irq_64.c | 4 +- arch/x86/kernel/visws_quirks.c | 2 +- arch/x86/xen/spinlock.c | 2 +- fs/proc/proc_misc.c | 2 +- include/linux/irq.h | 7 +++ include/linux/kernel_stat.h | 22 +++++++--- kernel/irq/chip.c | 15 +++---- kernel/irq/handle.c | 97 ++++++++++++++++++++++++++++++------------ kernel/sched.c | 5 +-- 11 files changed, 106 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index c2160cfdec9..204884b1415 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -526,7 +526,7 @@ static void do_irq_balance(void) if (package_index == i) IRQ_DELTA(package_index, j) = 0; /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_cpu(i).irqs[j]; + value_now = (unsigned long) kstat_irqs_cpu(j, i); /* Determine the activity per processor per IRQ */ delta = value_now - LAST_CPU_IRQ(i, j); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index ede513be517..576c5df6cad 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -280,7 +280,7 @@ int show_interrupts(struct seq_file *p, void *v) any_count = kstat_irqs(i); #else for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; + any_count |= kstat_irqs_cpu(i, j); #endif action = desc->action; if (!action && !any_count) @@ -290,7 +290,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%10u ", kstat_irqs(i)); #else for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif seq_printf(p, " %8s", desc->chip->name); seq_printf(p, "-%-8s", desc->name); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 738eb65a924..4a0a4eb44dc 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -90,7 +90,7 @@ int show_interrupts(struct seq_file *p, void *v) any_count = kstat_irqs(i); #else for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; + any_count |= kstat_irqs_cpu(i, j); #endif action = desc->action; if (!action && !any_count) @@ -100,7 +100,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%10u ", kstat_irqs(i)); #else for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif seq_printf(p, " %8s", desc->chip->name); seq_printf(p, "-%-8s", desc->name); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 9d85ab38443..817aa55a120 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -633,7 +633,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) /* * handle this 'virtual interrupt' as a Cobalt one now. */ - kstat_cpu(smp_processor_id()).irqs[realirq]++; + kstat_irqs_this_cpu(desc)++; if (likely(desc->action != NULL)) handle_IRQ_event(realirq, desc->action); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index dd71e3a021c..bb6bc721b13 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(irq_to_desc(irq))++; out: raw_local_irq_restore(flags); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index a2173a2a562..aa069acf61a 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -532,7 +532,7 @@ static int show_stat(struct seq_file *p, void *v) steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); for (j = 0; j < nr_irqs; j++) { - unsigned int temp = kstat_cpu(i).irqs[j]; + unsigned int temp = kstat_irqs_cpu(j, i); sum += temp; per_irq_sum[j] += temp; } diff --git a/include/linux/irq.h b/include/linux/irq.h index 60c856aaac0..cbf471aee1c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -157,6 +157,11 @@ struct irq_desc { #ifdef CONFIG_HAVE_SPARSE_IRQ struct irq_desc *next; struct timer_rand_state *timer_rand_state; +#endif +#ifdef CONFIG_HAVE_DYN_ARRAY + unsigned int *kstat_irqs; +#else + unsigned int kstat_irqs[NR_CPUS]; #endif irq_flow_handler_t handle_irq; struct irq_chip *chip; @@ -190,6 +195,8 @@ extern struct irq_desc *irq_to_desc(unsigned int irq); /* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; #endif +#define kstat_irqs_this_cpu(DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]) /* * Migration helpers for obsolete names, they will go away: diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index fe1f7fe534b..f10616712de 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,10 +28,8 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned int *irqs; -#else - unsigned int irqs[NR_IRQS]; +#ifndef CONFIG_GENERIC_HARDIRQS + unsigned int irqs[NR_IRQS]; #endif }; @@ -43,15 +41,25 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); +#ifndef CONFIG_GENERIC_HARDIRQS +static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + return kstat_cpu(cpu).irqs[irq]; +} +#else +extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); +#endif + /* * Number of interrupts per specific IRQ source, since bootup */ -static inline int kstat_irqs(int irq) +static inline unsigned int kstat_irqs(unsigned int irq) { - int cpu, sum = 0; + unsigned int sum = 0; + int cpu; for_each_possible_cpu(cpu) - sum += kstat_cpu(cpu).irqs[irq]; + sum += kstat_irqs_cpu(irq, cpu); return sum; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 76c225cf4b2..2aa3d4b2fce 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -312,14 +312,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { struct irqaction *action; irqreturn_t action_ret; - const unsigned int cpu = smp_processor_id(); spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -351,7 +350,6 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -361,7 +359,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; /* * If its disabled or no action available @@ -399,7 +397,6 @@ out_unlock: void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -409,7 +406,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; /* * If its disabled or no action available @@ -458,8 +455,6 @@ out: void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { - const unsigned int cpu = smp_processor_id(); - spin_lock(&desc->lock); desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -476,7 +471,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; } - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; /* Start handling the irq */ desc->chip->ack(irq); @@ -531,7 +526,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(desc)++; if (desc->chip->ack) desc->chip->ack(irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9fc33b3378e..1f346990f3f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -37,7 +37,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(desc)++; ack_bad_irq(irq); } @@ -80,17 +80,38 @@ static void init_one_irq_desc(struct irq_desc *desc) #endif } -#ifdef CONFIG_HAVE_SPARSE_IRQ -static int nr_irq_desc = 32; +extern int after_bootmem; +extern void *__alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); -static int __init parse_nr_irq_desc(char *arg) +static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) { - if (arg) - nr_irq_desc = simple_strtoul(arg, NULL, 0); - return 0; + unsigned long bytes, total_bytes; + char *ptr; + int i; + unsigned long phys; + + /* Compute how many bytes we need per irq and allocate them */ + bytes = nr * sizeof(unsigned int); + total_bytes = bytes * nr_desc; + if (after_bootmem) + ptr = kzalloc(total_bytes, GFP_ATOMIC); + else + ptr = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); + + if (!ptr) + panic(" can not allocate kstat_irqs\n"); + + phys = __pa(ptr); + printk(KERN_DEBUG "kstat_irqs ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + + for (i = 0; i < nr_desc; i++) { + desc[i].kstat_irqs = (unsigned int *)ptr; + ptr += bytes; + } } -early_param("nr_irq_desc", parse_nr_irq_desc); static void __init init_work(void *data) { @@ -100,25 +121,44 @@ static void __init init_work(void *data) desc = *da->name; - for (i = 0; i < *da->nr; i++) + for (i = 0; i < *da->nr; i++) { init_one_irq_desc(&desc[i]); +#ifndef CONFIG_HAVE_SPARSE_IRQ + desc[i].irq = i; +#endif + } +#ifdef CONFIG_HAVE_SPARSE_IRQ for (i = 1; i < *da->nr; i++) desc[i-1].next = &desc[i]; +#endif + + /* init kstat_irqs, nr_cpu_ids is ready already */ + init_kstat_irqs(desc, *da->nr, nr_cpu_ids); } +#ifdef CONFIG_HAVE_SPARSE_IRQ +static int nr_irq_desc = 32; + +static int __init parse_nr_irq_desc(char *arg) +{ + if (arg) + nr_irq_desc = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("nr_irq_desc", parse_nr_irq_desc); + static struct irq_desc *sparse_irqs; DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); -extern int after_bootmem; -extern void *__alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal); struct irq_desc *irq_to_desc(unsigned int irq) { struct irq_desc *desc, *desc_pri; int i; int count = 0; + unsigned long phys; + unsigned long total_bytes; BUG_ON(irq == -1U); @@ -141,38 +181,34 @@ struct irq_desc *irq_to_desc(unsigned int irq) */ printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); + total_bytes = sizeof(struct irq_desc) * nr_irq_desc; if (after_bootmem) - desc = kzalloc(sizeof(struct irq_desc)*nr_irq_desc, GFP_ATOMIC); + desc = kzalloc(total_bytes, GFP_ATOMIC); else - desc = __alloc_bootmem_nopanic(sizeof(struct irq_desc)*nr_irq_desc, PAGE_SIZE, 0); + desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); if (!desc) panic("please boot with nr_irq_desc= %d\n", count * 2); + phys = __pa(desc); + printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + for (i = 0; i < nr_irq_desc; i++) init_one_irq_desc(&desc[i]); for (i = 1; i < nr_irq_desc; i++) desc[i-1].next = &desc[i]; + /* init kstat_irqs, nr_cpu_ids is ready already */ + init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids); + desc->irq = irq; desc_pri->next = desc; return desc; } #else -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - int i; - struct irq_desc *desc; - - desc = *da->name; - for (i = 0; i < *da->nr; i++) - init_one_irq_desc(&desc[i]); - -} static struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); @@ -315,7 +351,7 @@ unsigned int __do_IRQ(unsigned int irq) struct irqaction *action; unsigned int status; - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(desc)++; if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; @@ -415,3 +451,10 @@ void early_init_irq_lock_class(void) } #endif +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc->kstat_irqs[cpu]; +} +EXPORT_SYMBOL(kstat_irqs_cpu); + diff --git a/kernel/sched.c b/kernel/sched.c index b9d713781b5..6f230596bd0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4048,11 +4048,8 @@ static inline void idle_balance(int cpu, struct rq *rq) #endif DEFINE_PER_CPU(struct kernel_stat, kstat); -EXPORT_PER_CPU_SYMBOL(kstat); -#ifdef CONFIG_HAVE_DYN_ARRAY -DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL); -#endif +EXPORT_PER_CPU_SYMBOL(kstat); /* * Return p->sum_exec_runtime plus any more ns on the sched_clock -- cgit v1.2.3-70-g09d2 From 2c6927a38f65b53b62f86158fba29a068c4e8b6a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:11 -0700 Subject: irq: replace loop with nr_irqs with for_each_irq_desc There are a handful of loops that go from 0 to nr_irqs and use get_irq_desc() on them. These would allocate all the irq_desc entries, regardless of the need for them. Use the smarter for_each_irq_desc() iterator that will only iterate over the present ones. v2: make sure arch without GENERIC_HARDIRQS work too Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 6 +++--- arch/x86/kernel/irq_64.c | 5 ++--- arch/x86/kernel/irqinit_64.c | 17 +++++------------ include/linux/irq.h | 7 +++++++ kernel/irq/internals.h | 4 ++-- kernel/irq/manage.c | 2 +- kernel/irq/proc.c | 10 +++++----- 7 files changed, 25 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 51ef7eb75f2..708be9724da 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1871,10 +1871,10 @@ unmask: static void ir_irq_migration(struct work_struct *work) { - int irq; + unsigned int irq; + struct irq_desc *desc; - for (irq = 0; irq < nr_irqs; irq++) { - struct irq_desc *desc = irq_to_desc(irq); + for_each_irq_desc(irq, desc) { if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 4a0a4eb44dc..b3cf55e325f 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -224,17 +224,16 @@ void fixup_irqs(cpumask_t map) { unsigned int irq; static int warned; + struct irq_desc *desc; - for (irq = 0; irq < nr_irqs; irq++) { + for_each_irq_desc(irq, desc) { cpumask_t mask; int break_affinity = 0; int set_affinity = 1; - struct irq_desc *desc; if (irq == 2) continue; - desc = irq_to_desc(irq); /* interrupt's are disabled at this point */ spin_lock(&desc->lock); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 0744b49b4d1..cd9f42d028d 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -142,25 +142,18 @@ void __init init_ISA_irqs(void) init_bsp_APIC(); init_8259A(0); - for (i = 0; i < nr_irqs; i++) { + for (i = 0; i < 16; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; desc->action = NULL; desc->depth = 1; - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - desc->chip = &no_irq_chip; - } } } diff --git a/include/linux/irq.h b/include/linux/irq.h index c9ffef7c3b4..9de16ca8b8e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -202,9 +202,16 @@ extern struct irq_desc irq_desc[NR_IRQS]; extern struct irq_desc *irq_desc; #endif +#ifdef CONFIG_GENERIC_HARDIRQS +#define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc = &irq_desc[irq]) +#endif + #else extern struct irq_desc *sparse_irqs; +#define for_each_irq_desc(irqX, desc) \ + for (desc = sparse_irqs, irqX = desc->irq; desc && irqX != -1U; desc = desc->next, irqX = desc ? desc->irq : -1U) #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 422dd00c8bd..c9767e64198 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -14,11 +14,11 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); #ifdef CONFIG_PROC_FS -extern void register_irq_proc(unsigned int irq); +extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else -static inline void register_irq_proc(unsigned int irq) { } +static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b5943e9f95a..5070f55fdc1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -478,7 +478,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) spin_unlock_irqrestore(&desc->lock, flags); new->irq = irq; - register_irq_proc(irq); + register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index c2f356c808f..bc0993d86c8 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -182,11 +182,10 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) #define MAX_NAMELEN 10 -void register_irq_proc(unsigned int irq) +void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; struct proc_dir_entry *entry; - struct irq_desc *desc = irq_to_desc(irq); if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; @@ -230,7 +229,8 @@ void register_default_affinity_proc(void) void init_irq_proc(void) { - int i; + unsigned int irq; + struct irq_desc *desc; /* create /proc/irq */ root_irq_dir = proc_mkdir("irq", NULL); @@ -242,7 +242,7 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for (i = 0; i < nr_irqs; i++) - register_irq_proc(i); + for_each_irq_desc(irq, desc) + register_irq_proc(irq, desc); } -- cgit v1.2.3-70-g09d2 From 46b8214d12c274bd4265aae482ab7ffe69d94818 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:13 -0700 Subject: x86, ioapic: replace loop with nr_irqs with for_each_irq_icfg Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 708be9724da..60d60061659 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -129,6 +129,9 @@ static void __init init_work(void *data) cfg[i-1].next = &cfg[i]; } +#define for_each_irq_cfg(cfg) \ + for (cfg = irq_cfgx; cfg && cfg->irq != -1U; cfg = cfg->next) + static struct irq_cfg *irq_cfgx; DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); @@ -1097,20 +1100,18 @@ void __setup_vector_irq(int cpu) /* Initialize vector_irq on a new cpu */ /* This function must be called with vector_lock held */ int irq, vector; + struct irq_cfg *cfg; /* Mark the inuse vectors */ - for (irq = 0; irq < nr_irqs; ++irq) { - struct irq_cfg *cfg = irq_cfg(irq); - + for_each_irq_cfg(cfg) { if (!cpu_isset(cpu, cfg->domain)) continue; vector = cfg->vector; + irq = cfg->irq; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { - struct irq_cfg *cfg; - irq = per_cpu(vector_irq, cpu)[vector]; if (irq < 0) continue; @@ -1340,6 +1341,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; unsigned long flags; + struct irq_cfg *cfg; if (apic_verbosity == APIC_QUIET) return; @@ -1408,12 +1410,11 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < nr_irqs; i++) { - struct irq_cfg *cfg = irq_cfg(i); + for_each_irq_cfg(cfg) { struct irq_pin_list *entry = cfg->irq_2_pin; if (!entry) continue; - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", cfg->irq); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -2070,6 +2071,7 @@ static inline void init_IO_APIC_traps(void) { int irq; struct irq_desc *desc; + struct irq_cfg *cfg; /* * NOTE! The local APIC isn't very good at handling @@ -2082,10 +2084,8 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < nr_irqs ; irq++) { - struct irq_cfg *cfg; - - cfg = irq_cfg(irq); + for_each_irq_cfg(cfg) { + irq = cfg->irq; if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, -- cgit v1.2.3-70-g09d2 From 7d94f7ca401dd7f445fda9a971a48aa5427b3e55 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:14 -0700 Subject: irq: remove >= nr_irqs checking with config_have_sparse_irq remove irq limit checks - nr_irqs is dynamic and we expand anytime. v2: fix checking about result irq_cfg_without_new, so could use msi again v3: use irq_desc_without_new to check irq is valid Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 9 -------- arch/x86/kernel/irq_64.c | 2 +- kernel/irq/chip.c | 49 ++++++++++++++++++++++++-------------------- kernel/irq/manage.c | 43 +++++++++++++++++++++++--------------- 4 files changed, 55 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 60d60061659..1b8cccb5ba2 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -400,7 +400,6 @@ static inline void io_apic_sync(unsigned int apic) struct irq_cfg *cfg; \ struct irq_pin_list *entry; \ \ - BUG_ON(irq >= nr_irqs); \ cfg = irq_cfg(irq); \ entry = cfg->irq_2_pin; \ for (;;) { \ @@ -480,7 +479,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) struct irq_cfg *cfg; struct irq_pin_list *entry; - BUG_ON(irq >= nr_irqs); cfg = irq_cfg(irq); entry = cfg->irq_2_pin; for (;;) { @@ -549,7 +547,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) struct irq_cfg *cfg; struct irq_pin_list *entry; - BUG_ON(irq >= nr_irqs); /* first time to refer irq_cfg, so with new */ cfg = irq_cfg_alloc(irq); entry = cfg->irq_2_pin; @@ -841,7 +838,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) best_guess = irq; } } - BUG_ON(best_guess >= nr_irqs); return best_guess; } @@ -973,7 +969,6 @@ static int pin_2_irq(int idx, int apic, int pin) irq += nr_ioapic_registers[i++]; irq += pin; } - BUG_ON(irq >= nr_irqs); return irq; } @@ -1008,7 +1003,6 @@ static int __assign_irq_vector(int irq, cpumask_t mask) int cpu; struct irq_cfg *cfg; - BUG_ON((unsigned)irq >= nr_irqs); cfg = irq_cfg(irq); /* Only try and allocate irqs on cpus that are present */ @@ -1082,7 +1076,6 @@ static void __clear_irq_vector(int irq) cpumask_t mask; int cpu, vector; - BUG_ON((unsigned)irq >= nr_irqs); cfg = irq_cfg(irq); BUG_ON(!cfg->vector); @@ -1924,8 +1917,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_desc *desc; struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; - if (irq >= nr_irqs) - continue; desc = irq_to_desc(irq); cfg = irq_cfg(irq); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index b3cf55e325f..a3e36336d91 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -202,7 +202,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - if (likely(irq < nr_irqs)) + if (likely(__irq_to_desc(irq))) generic_handle_irq(irq); else { if (!disable_apic) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2aa3d4b2fce..a4bb0da9c88 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -27,13 +27,13 @@ void dynamic_irq_init(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = irq_to_desc(irq); + if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; } /* Ensure we don't have left over values from a previous use of this irq */ - desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; @@ -60,12 +60,12 @@ void dynamic_irq_cleanup(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; } - desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); @@ -92,7 +92,8 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; } @@ -121,12 +122,12 @@ int set_irq_type(unsigned int irq, unsigned int type) unsigned long flags; int ret = -ENXIO; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); return -ENODEV; } - desc = irq_to_desc(irq); if (type == IRQ_TYPE_NONE) return 0; @@ -149,13 +150,13 @@ int set_irq_data(unsigned int irq, void *data) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to install controller data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; spin_unlock_irqrestore(&desc->lock, flags); @@ -175,12 +176,13 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to install msi data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_to_desc(irq); + spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) @@ -201,8 +203,14 @@ int set_irq_chip_data(unsigned int irq, void *data) struct irq_desc *desc; unsigned long flags; - desc = irq_to_desc(irq); - if (irq >= nr_irqs || !desc->chip) { + desc = __irq_to_desc(irq); + if (!desc) { + printk(KERN_ERR + "Trying to install chip data for IRQ%d\n", irq); + return -EINVAL; + } + + if (!desc->chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; } @@ -546,14 +554,13 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); return; } - desc = irq_to_desc(irq); - if (!handle) handle = handle_bad_irq; else if (desc->chip == &no_irq_chip) { @@ -611,14 +618,13 @@ void __init set_irq_noprobe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); return; } - desc = irq_to_desc(irq); - spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; spin_unlock_irqrestore(&desc->lock, flags); @@ -629,14 +635,13 @@ void __init set_irq_probe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) { + desc = __irq_to_desc(irq); + if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); return; } - desc = irq_to_desc(irq); - spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5070f55fdc1..c0b4d4df6de 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,10 +31,10 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL; */ void synchronize_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = __irq_to_desc(irq); unsigned int status; - if (irq >= nr_irqs) + if (!desc) return; do { @@ -142,10 +142,11 @@ int irq_select_affinity(unsigned int irq) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) + desc = __irq_to_desc(irq); + if (!desc) return; spin_lock_irqsave(&desc->lock, flags); @@ -171,9 +172,10 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; - if (irq >= nr_irqs) + desc = __irq_to_desc(irq); + if (!desc) return; disable_irq_nosync(irq); @@ -213,10 +215,11 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; unsigned long flags; - if (irq >= nr_irqs) + desc = __irq_to_desc(irq); + if (!desc) return; spin_lock_irqsave(&desc->lock, flags); @@ -290,10 +293,14 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; struct irqaction *action; - if (irq >= nr_irqs || desc->status & IRQ_NOREQUEST) + desc = __irq_to_desc(irq); + if (!desc) + return 0; + + if (desc->status & IRQ_NOREQUEST) return 0; action = desc->action; @@ -352,14 +359,15 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, */ int setup_irq(unsigned int irq, struct irqaction *new) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; struct irqaction *old, **p; const char *old_name = NULL; unsigned long flags; int shared = 0; int ret; - if (irq >= nr_irqs) + desc = __irq_to_desc(irq); + if (!desc) return -EINVAL; if (desc->chip == &no_irq_chip) @@ -518,10 +526,11 @@ void free_irq(unsigned int irq, void *dev_id) unsigned long flags; WARN_ON(in_interrupt()); - if (irq >= nr_irqs) + + desc = __irq_to_desc(irq); + if (!desc) return; - desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); p = &desc->action; for (;;) { @@ -634,9 +643,11 @@ int request_irq(unsigned int irq, irq_handler_t handler, */ if ((irqflags & IRQF_SHARED) && !dev_id) return -EINVAL; - if (irq >= nr_irqs) + + desc = __irq_to_desc(irq); + if (!desc) return -EINVAL; - desc = irq_to_desc(irq); + if (desc->status & IRQ_NOREQUEST) return -EINVAL; if (!handler) -- cgit v1.2.3-70-g09d2 From 46926b67fc663d357a1a8174328998a9e49da0b8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:15 -0700 Subject: generic: add irq_desc in function in parameter So we could remove some duplicated calling to irq_desc v2: make sure irq_desc in init/main.c is not used without generic_hardirqs Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 6 ++++-- include/linux/irq.h | 9 ++++++--- init/main.c | 7 +++++++ kernel/irq/handle.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index a3e36336d91..f58b995b30e 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -189,6 +189,7 @@ u64 arch_irq_stat(void) asmlinkage unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); + struct irq_desc *desc; /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; @@ -202,8 +203,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - if (likely(__irq_to_desc(irq))) - generic_handle_irq(irq); + desc = __irq_to_desc(irq); + if (likely(desc)) + generic_handle_irq_desc(irq, desc); else { if (!disable_apic) ack_APIC_irq(); diff --git a/include/linux/irq.h b/include/linux/irq.h index 9de16ca8b8e..7b59e193a11 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -315,10 +315,8 @@ extern unsigned int __do_IRQ(unsigned int irq); * irqchip-style controller then we call the ->handle_irq() handler, * and it calls __do_IRQ() if it's attached to an irqtype-style controller. */ -static inline void generic_handle_irq(unsigned int irq) +static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); - #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ desc->handle_irq(irq, desc); #else @@ -329,6 +327,11 @@ static inline void generic_handle_irq(unsigned int irq) #endif } +static inline void generic_handle_irq(unsigned int irq) +{ + generic_handle_irq_desc(irq, irq_to_desc(irq)); +} + /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(unsigned int irq, struct irq_desc *desc, int action_ret); diff --git a/init/main.c b/init/main.c index ab97d0877ac..0d2e60144f8 100644 --- a/init/main.c +++ b/init/main.c @@ -590,6 +590,13 @@ void pre_alloc_dyn_array(void) if (da->init_work) da->init_work(da); } +#else +#ifdef CONFIF_GENERIC_HARDIRQS + unsigned int i; + + for (i = 0; i < NR_IRQS; i++) + irq_desc[i].irq = i; +#endif #endif } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 8e55dbe50af..e1d787e9169 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -197,6 +197,21 @@ struct irq_desc *irq_to_desc(unsigned int irq) * we run out of pre-allocate ones, allocate more */ printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); + { + /* double check if some one mess up the list */ + struct irq_desc *desc; + int count = 0; + + desc = &sparse_irqs[0]; + while (desc) { + printk(KERN_DEBUG "found irq_desc for irq %d\n", desc->irq); + if (desc->next) + printk(KERN_DEBUG "found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq); + desc = desc->next; + count++; + } + printk(KERN_DEBUG "all preallocted %d\n", count); + } total_bytes = sizeof(struct irq_desc) * nr_irq_desc; if (after_bootmem) @@ -221,6 +236,21 @@ struct irq_desc *irq_to_desc(unsigned int irq) desc->irq = irq; desc_pri->next = desc; + { + /* double check if some one mess up the list */ + struct irq_desc *desc; + int count = 0; + + desc = &sparse_irqs[0]; + while (desc) { + printk(KERN_DEBUG "1 found irq_desc for irq %d\n", desc->irq); + if (desc->next) + printk(KERN_DEBUG "1 found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq); + desc = desc->next; + count++; + } + printk(KERN_DEBUG "1 all preallocted %d\n", count); + } return desc; } -- cgit v1.2.3-70-g09d2 From 1d5f6b36c4736af1dac396d6267eb53dcc8c0021 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:16 -0700 Subject: x86: check with without_new in show_interrupts so we don't get new one that we don't need it. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f58b995b30e..f337f87c1e1 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -83,7 +83,10 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; - struct irq_desc *desc = irq_to_desc(i); + struct irq_desc *desc = __irq_to_desc(i); + + if (!desc) + return 0; spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP -- cgit v1.2.3-70-g09d2 From cb5bc83225a86ca53bbb889ed8439e4fd6cf44ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:17 -0700 Subject: x86_64: rename irq_desc/irq_desc_alloc change names: irq_desc() ==> irq_desc_alloc __irq_desc() ==> irq_desc Also split a few of the uses in lowlevel x86 code. v2: need to check if desc is null in smp_irq_move_cleanup Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 10 +++++++++- arch/x86/kernel/irq_64.c | 4 ++-- arch/x86/kernel/irqinit_64.c | 3 ++- include/linux/irq.h | 2 +- kernel/irq/chip.c | 21 +++++++++++---------- kernel/irq/handle.c | 23 +++++------------------ kernel/irq/manage.c | 16 ++++++++-------- 7 files changed, 38 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 1b8cccb5ba2..a054db9ef19 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1124,7 +1124,12 @@ static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; - desc = irq_to_desc(irq); + /* first time to use this irq_desc */ + if (irq < 16) + desc = irq_to_desc(irq); + else + desc = irq_to_desc_alloc(irq); + if (trigger) desc->status |= IRQ_LEVEL; else @@ -1919,6 +1924,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) irq = __get_cpu_var(vector_irq)[vector]; desc = irq_to_desc(irq); + if (!desc) + continue; + cfg = irq_cfg(irq); spin_lock(&desc->lock); if (!cfg->move_cleanup_count) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f337f87c1e1..5d5976e0311 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -83,7 +83,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; - struct irq_desc *desc = __irq_to_desc(i); + struct irq_desc *desc = irq_to_desc(i); if (!desc) return 0; @@ -206,7 +206,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (likely(desc)) generic_handle_irq_desc(irq, desc); else { diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index cd9f42d028d..d17fbc26d96 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -143,7 +143,8 @@ void __init init_ISA_irqs(void) init_8259A(0); for (i = 0; i < 16; i++) { - struct irq_desc *desc = irq_to_desc(i); + /* first time call this irq_desc */ + struct irq_desc *desc = irq_to_desc_alloc(i); desc->status = IRQ_DISABLED; desc->action = NULL; diff --git a/include/linux/irq.h b/include/linux/irq.h index 7b59e193a11..5fe1b01c11f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -191,7 +191,7 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; extern struct irq_desc *irq_to_desc(unsigned int irq); -extern struct irq_desc *__irq_to_desc(unsigned int irq); +extern struct irq_desc *irq_to_desc_alloc(unsigned int irq); #ifndef CONFIG_HAVE_SPARSE_IRQ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a4bb0da9c88..9fc5e69213d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -27,7 +27,8 @@ void dynamic_irq_init(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = irq_to_desc(irq); + /* first time to use this irq_desc */ + desc = irq_to_desc_alloc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; @@ -60,7 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; @@ -92,7 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; @@ -122,7 +123,7 @@ int set_irq_type(unsigned int irq, unsigned int type) unsigned long flags; int ret = -ENXIO; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); return -ENODEV; @@ -150,7 +151,7 @@ int set_irq_data(unsigned int irq, void *data) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install controller data for IRQ%d\n", irq); @@ -176,7 +177,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install msi data for IRQ%d\n", irq); @@ -203,7 +204,7 @@ int set_irq_chip_data(unsigned int irq, void *data) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install chip data for IRQ%d\n", irq); @@ -554,7 +555,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); @@ -618,7 +619,7 @@ void __init set_irq_noprobe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); @@ -635,7 +636,7 @@ void __init set_irq_probe(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e1d787e9169..d44e3515eae 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -151,7 +151,7 @@ early_param("nr_irq_desc", parse_nr_irq_desc); struct irq_desc *sparse_irqs; DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); -struct irq_desc *__irq_to_desc(unsigned int irq) +struct irq_desc *irq_to_desc(unsigned int irq) { struct irq_desc *desc; @@ -169,7 +169,7 @@ struct irq_desc *__irq_to_desc(unsigned int irq) } return NULL; } -struct irq_desc *irq_to_desc(unsigned int irq) +struct irq_desc *irq_to_desc_alloc(unsigned int irq) { struct irq_desc *desc, *desc_pri; int i; @@ -186,6 +186,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) if (desc->irq == -1U) { desc->irq = irq; + printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq); return desc; } desc_pri = desc; @@ -236,21 +237,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) desc->irq = irq; desc_pri->next = desc; - { - /* double check if some one mess up the list */ - struct irq_desc *desc; - int count = 0; - - desc = &sparse_irqs[0]; - while (desc) { - printk(KERN_DEBUG "1 found irq_desc for irq %d\n", desc->irq); - if (desc->next) - printk(KERN_DEBUG "1 found irq_desc for irq %d and next will be irq %d\n", desc->irq, desc->next->irq); - desc = desc->next; - count++; - } - printk(KERN_DEBUG "1 all preallocted %d\n", count); - } + printk(KERN_DEBUG "1 found new irq_desc for irq %d and pri will be irq %d\n", desc->irq, desc_pri->irq); return desc; } @@ -285,7 +272,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *__irq_to_desc(unsigned int irq) +struct irq_desc *irq_to_desc_alloc(unsigned int irq) { return irq_to_desc(irq); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c0b4d4df6de..6df49218632 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,7 +31,7 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL; */ void synchronize_irq(unsigned int irq) { - struct irq_desc *desc = __irq_to_desc(irq); + struct irq_desc *desc = irq_to_desc(irq); unsigned int status; if (!desc) @@ -145,7 +145,7 @@ void disable_irq_nosync(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return; @@ -174,7 +174,7 @@ void disable_irq(unsigned int irq) { struct irq_desc *desc; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return; @@ -218,7 +218,7 @@ void enable_irq(unsigned int irq) struct irq_desc *desc; unsigned long flags; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return; @@ -296,7 +296,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) struct irq_desc *desc; struct irqaction *action; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return 0; @@ -366,7 +366,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) int shared = 0; int ret; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return -EINVAL; @@ -527,7 +527,7 @@ void free_irq(unsigned int irq, void *dev_id) WARN_ON(in_interrupt()); - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return; @@ -644,7 +644,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, if ((irqflags & IRQF_SHARED) && !dev_id) return -EINVAL; - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (!desc) return -EINVAL; -- cgit v1.2.3-70-g09d2 From a2f9f43858db64cb8b45c4f6746d7a52b80d4dcb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:19 -0700 Subject: x86_64: separate irq_cfgx from irq_cfgx_free so later don't need to compare with -1U Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 92 ++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index a054db9ef19..8ab7ae01773 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -111,44 +111,44 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); } +static struct irq_cfg *irq_cfgx; +static struct irq_cfg *irq_cfgx_free; static void __init init_work(void *data) { struct dyn_array *da = data; struct irq_cfg *cfg; + int legacy_count; int i; cfg = *da->name; memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - i = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); - for (; i < *da->nr; i++) + legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); for (i = 1; i < *da->nr; i++) cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = &irq_cfgx[legacy_count]; + irq_cfgx[legacy_count - 1].next = NULL; } #define for_each_irq_cfg(cfg) \ - for (cfg = irq_cfgx; cfg && cfg->irq != -1U; cfg = cfg->next) + for (cfg = irq_cfgx; cfg; cfg = cfg->next) -static struct irq_cfg *irq_cfgx; DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); static struct irq_cfg *irq_cfg(unsigned int irq) { struct irq_cfg *cfg; - BUG_ON(irq == -1U); - - cfg = &irq_cfgx[0]; + cfg = irq_cfgx; while (cfg) { if (cfg->irq == irq) return cfg; - if (cfg->irq == -1U) - return NULL; - cfg = cfg->next; } @@ -161,44 +161,70 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) int i; int count = 0; - BUG_ON(irq == -1U); - - cfg_pri = cfg = &irq_cfgx[0]; + cfg_pri = cfg = irq_cfgx; while (cfg) { if (cfg->irq == irq) return cfg; - if (cfg->irq == -1U) { - cfg->irq = irq; - return cfg; - } cfg_pri = cfg; cfg = cfg->next; count++; } - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); + if (!irq_cfgx_free) { + unsigned long phys; + unsigned long total_bytes; + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); - if (after_bootmem) - cfg = kzalloc(sizeof(struct irq_cfg)*nr_irq_cfg, GFP_ATOMIC); - else - cfg = __alloc_bootmem_nopanic(sizeof(struct irq_cfg)*nr_irq_cfg, PAGE_SIZE, 0); + total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; + if (after_bootmem) + cfg = kzalloc(total_bytes, GFP_ATOMIC); + else + cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - if (!cfg) - panic("please boot with nr_irq_cfg= %d\n", count * 2); + if (!cfg) + panic("please boot with nr_irq_cfg= %d\n", count * 2); - for (i = 0; i < nr_irq_cfg; i++) - init_one_irq_cfg(&cfg[i]); + phys = __pa(cfg); + printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - for (i = 1; i < nr_irq_cfg; i++) - cfg[i-1].next = &cfg[i]; + for (i = 0; i < nr_irq_cfg; i++) + init_one_irq_cfg(&cfg[i]); - cfg->irq = irq; - cfg_pri->next = cfg; + for (i = 1; i < nr_irq_cfg; i++) + cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = cfg; + } + cfg = irq_cfgx_free; + irq_cfgx_free = irq_cfgx_free->next; + cfg->next = NULL; + if (cfg_pri) + cfg_pri->next = cfg; + else + irq_cfgx = cfg; + cfg->irq = irq; + printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); +#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG + { + /* dump the results */ + struct irq_cfg *cfg; + unsigned long phys; + unsigned long bytes = sizeof(struct irq_cfg); + + printk(KERN_DEBUG "=========================== %d\n", irq); + printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); + for_each_irq_cfg(cfg) { + phys = __pa(cfg); + printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); + } + printk(KERN_DEBUG "===========================\n"); + } +#endif return cfg; } -- cgit v1.2.3-70-g09d2 From 52b17329d6d0a4824b89206803a430915031ff23 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:20 -0700 Subject: x86_64: make /proc/interrupts work with dyn irq_desc loop with irq_desc list Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 31 ++++++++++++++++++++++++------- fs/proc/proc_misc.c | 28 ++++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 5d5976e0311..7bd841a9c64 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -70,9 +70,27 @@ static inline void stack_overflow_check(struct pt_regs *regs) int show_interrupts(struct seq_file *p, void *v) { - int i = *(loff_t *) v, j; + int i, j; struct irqaction * action; unsigned long flags; + unsigned int entries; + struct irq_desc *desc; + int tail = 0; + +#ifdef CONFIG_HAVE_SPARSE_IRQ + desc = (struct irq_desc *)v; + entries = -1U; + i = desc->irq; + if (!desc->next) + tail = 1; +#else + entries = nr_irqs - 1; + i = *(loff_t *) v; + if (i == nr_irqs) + tail = 1; + else + desc = irq_to_desc(i); +#endif if (i == 0) { seq_printf(p, " "); @@ -81,12 +99,8 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < nr_irqs) { + if (i <= entries) { unsigned any_count = 0; - struct irq_desc *desc = irq_to_desc(i); - - if (!desc) - return 0; spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP @@ -116,7 +130,9 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&desc->lock, flags); - } else if (i == nr_irqs) { + } + + if (tail) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); @@ -155,6 +171,7 @@ skip: seq_printf(p, " Spurious interrupts\n"); seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); } + return 0; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index c3cbabe8b38..72dd739a7f8 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -645,15 +645,36 @@ static const struct file_operations proc_stat_operations = { */ static void *int_seq_start(struct seq_file *f, loff_t *pos) { +#ifdef CONFIG_HAVE_SPARSE_IRQ + struct irq_desc *desc; + int irq; + int count = *pos; + + for_each_irq_desc(irq, desc) { + if (count-- == 0) + return desc; + } + + return NULL; +#else return (*pos <= nr_irqs) ? pos : NULL; +#endif } + static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) { +#ifdef CONFIG_HAVE_SPARSE_IRQ + struct irq_desc *desc; + + desc = ((struct irq_desc *)v)->next; (*pos)++; - if (*pos > nr_irqs) - return NULL; - return pos; + + return desc; +#else + (*pos)++; + return (*pos <= nr_irqs) ? pos : NULL; +#endif } static void int_seq_stop(struct seq_file *f, void *v) @@ -661,7 +682,6 @@ static void int_seq_stop(struct seq_file *f, void *v) /* Nothing to do */ } - static const struct seq_operations int_seq_ops = { .start = int_seq_start, .next = int_seq_next, -- cgit v1.2.3-70-g09d2 From 6d50bc26836e16a9589e0b128d527c29e30d722a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:22 -0700 Subject: x86: use 28 bits irq NR for pci msi/msix and ht also print out irq no in /proc/interrups and /proc/stat in hex, so could tell bus/dev/func. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 64 ++++++++++++++++++++++++++++++++++---------- arch/x86/kernel/irq_64.c | 2 +- drivers/pci/htirq.c | 22 +++++++++++++-- fs/proc/proc_misc.c | 2 +- include/linux/irq.h | 1 + 5 files changed, 73 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 8ab7ae01773..b0d4abc55a1 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -2520,17 +2520,21 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -int create_irq(void) +unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - int irq; - int new; + unsigned int irq; + unsigned int new; unsigned long flags; struct irq_cfg *cfg_new; - irq = -ENOSPC; +#ifndef CONFIG_HAVE_SPARSE_IRQ + irq_want = nr_irqs - 1; +#endif + + irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (nr_irqs - 1); new >= 0; new--) { + for (new = irq_want; new > 0; new--) { if (platform_legacy_irq(new)) continue; cfg_new = irq_cfg(new); @@ -2545,12 +2549,24 @@ int create_irq(void) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq >= 0) { + if (irq > 0) { dynamic_irq_init(irq); } return irq; } +int create_irq(void) +{ + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; +} + void destroy_irq(unsigned int irq) { unsigned long flags; @@ -2803,13 +2819,29 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) return 0; } +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { - int irq, ret; + unsigned int irq; + int ret; + unsigned int irq_want; - irq = create_irq(); - if (irq < 0) - return irq; + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + if (irq == 0) + return -1; #ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) @@ -2836,18 +2868,22 @@ error: int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, ret, sub_handle; + unsigned int irq; + int ret, sub_handle; struct msi_desc *desc; + unsigned int irq_want; + #ifdef CONFIG_INTR_REMAP struct intel_iommu *iommu = 0; int index = 0; #endif + irq_want = build_irq_for_pci_dev(dev) + 0x100; sub_handle = 0; list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq(); - if (irq < 0) - return irq; + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; #ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) goto no_ir; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 7bd841a9c64..348a11168c2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -112,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%3d: ",i); + seq_printf(p, "%#x: ",i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index 279c940a003..7c5aef13fcd 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -82,6 +82,18 @@ void unmask_ht_irq(unsigned int irq) write_ht_irq_msg(irq, &msg); } +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + /** * __ht_create_irq - create an irq and attach it to a device. * @dev: The hypertransport device to find the irq capability on. @@ -97,7 +109,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) u32 data; int max_irq; int pos; - int irq; + unsigned int irq; + unsigned int irq_want; pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ); if (!pos) @@ -125,8 +138,13 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) cfg->msg.address_lo = 0xffffffff; cfg->msg.address_hi = 0xffffffff; + irq_want= build_irq_for_pci_dev(dev); +#ifdef CONFIG_HAVE_SPARSE_IRQ + irq = create_irq_nr(irq_want + idx); +#else irq = create_irq(); - if (irq < 0) { +#endif + if (irq == 0) { kfree(cfg); return -EBUSY; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 72dd739a7f8..d68c3592fe4 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -589,7 +589,7 @@ static int show_stat(struct seq_file *p, void *v) } #ifdef CONFIG_HAVE_SPARSE_IRQ - seq_printf(p, " %u:%u", j, per_irq_sum); + seq_printf(p, " %#x:%u", j, per_irq_sum); #else seq_printf(p, " %u", per_irq_sum); #endif diff --git a/include/linux/irq.h b/include/linux/irq.h index 788d5a35a58..704136138dc 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -399,6 +399,7 @@ extern void set_irq_noprobe(unsigned int irq); extern void set_irq_probe(unsigned int irq); /* Handle dynamic irq creation and destruction */ +extern unsigned int create_irq_nr(unsigned int irq_want); extern int create_irq(void); extern void destroy_irq(unsigned int irq); -- cgit v1.2.3-70-g09d2 From 8b8e8c1bf7275eca859fe551dfa484134eaf013b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:23 -0700 Subject: x86: remove irqbalance in kernel for 32 bit This has been deprecated for years, the user space irqbalanced utility works better with numa, has configurable policies, etc... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 - arch/x86/configs/i386_defconfig | 1 - arch/x86/kernel/io_apic_32.c | 402 ---------------------------------------- arch/x86/kernel/quirks.c | 3 - include/linux/irq.h | 14 +- kernel/irq/manage.c | 3 - 6 files changed, 3 insertions(+), 428 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1004888e9b1..3e0eaaa1a33 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1254,14 +1254,6 @@ config EFI resultant kernel should continue to boot on existing non-EFI platforms. -config IRQBALANCE - def_bool y - prompt "Enable kernel irq balancing" - depends on X86_32 && SMP && X86_IO_APIC - help - The default yes will allow the kernel to do irq load balancing. - Saying no will keep the kernel from doing irq load balancing. - config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 52d0359719d..13b8c86ae98 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -287,7 +287,6 @@ CONFIG_MTRR=y # CONFIG_MTRR_SANITIZER is not set CONFIG_X86_PAT=y CONFIG_EFI=y -# CONFIG_IRQBALANCE is not set CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 204884b1415..668edf22606 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -371,408 +371,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) spin_unlock_irqrestore(&ioapic_lock, flags); } -#if defined(CONFIG_IRQBALANCE) -# include /* kernel_thread() */ -# include /* kstat */ -# include /* kmalloc() */ -# include - -#define IRQBALANCE_CHECK_ARCH -999 -#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -#define BALANCED_IRQ_MORE_DELTA (HZ/10) -#define BALANCED_IRQ_LESS_DELTA (HZ) - -static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; -static int physical_balance __read_mostly; -static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; - -static struct irq_cpu_info { - unsigned long *last_irq; - unsigned long *irq_delta; - unsigned long irq; -} irq_cpu_data[NR_CPUS]; - -#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq]) -#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq]) - -#define IDLE_ENOUGH(cpu,now) \ - (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) - -#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) - -#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) - -static cpumask_t balance_irq_affinity_init __initdata = CPU_MASK_ALL; - -static cpumask_t *balance_irq_affinity; - - -static void __init irq_affinity_init_work(void *data) -{ - struct dyn_array *da = data; - - int i; - struct balance_irq_affinity *affinity; - - affinity = *da->name; - - for (i = 0; i < *da->nr; i++) - memcpy(&affinity[i], &balance_irq_affinity_init, - sizeof(struct balance_irq_affinity)); - -} - -DEFINE_DYN_ARRAY(balance_irq_affinity, sizeof(struct balance_irq_affinity), nr_irqs, PAGE_SIZE, irq_affinity_init_work); - - -void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ - balance_irq_affinity[irq] = mask; -} - -static unsigned long move(int curr_cpu, cpumask_t allowed_mask, - unsigned long now, int direction) -{ - int search_idle = 1; - int cpu = curr_cpu; - - goto inside; - - do { - if (unlikely(cpu == curr_cpu)) - search_idle = 0; -inside: - if (direction == 1) { - cpu++; - if (cpu >= NR_CPUS) - cpu = 0; - } else { - cpu--; - if (cpu == -1) - cpu = NR_CPUS-1; - } - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) || - (search_idle && !IDLE_ENOUGH(cpu, now))); - - return cpu; -} - -static inline void balance_irq(int cpu, int irq) -{ - unsigned long now = jiffies; - cpumask_t allowed_mask; - unsigned int new_cpu; - - if (irqbalance_disabled) - return; - - cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); - new_cpu = move(cpu, allowed_mask, now, 1); - if (cpu != new_cpu) - set_pending_irq(irq, cpumask_of_cpu(new_cpu)); -} - -static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) -{ - int i, j; - struct irq_desc *desc; - - for_each_online_cpu(i) { - for (j = 0; j < nr_irqs; j++) { - desc = irq_to_desc(j); - if (!desc->action) - continue; - /* Is it a significant load ? */ - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < - useful_load_threshold) - continue; - balance_irq(i, j); - } - } - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; -} - -static void do_irq_balance(void) -{ - int i, j; - unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); - unsigned long move_this_load = 0; - int max_loaded = 0, min_loaded = 0; - int load; - unsigned long useful_load_threshold = balanced_irq_interval + 10; - int selected_irq; - int tmp_loaded, first_attempt = 1; - unsigned long tmp_cpu_irq; - unsigned long imbalance = 0; - cpumask_t allowed_mask, target_cpu_mask, tmp; - struct irq_desc *desc; - - for_each_possible_cpu(i) { - int package_index; - CPU_IRQ(i) = 0; - if (!cpu_online(i)) - continue; - package_index = CPU_TO_PACKAGEINDEX(i); - for (j = 0; j < nr_irqs; j++) { - unsigned long value_now, delta; - /* Is this an active IRQ or balancing disabled ? */ - desc = irq_to_desc(j); - if (!desc->action || irq_balancing_disabled(j)) - continue; - if (package_index == i) - IRQ_DELTA(package_index, j) = 0; - /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_irqs_cpu(j, i); - - /* Determine the activity per processor per IRQ */ - delta = value_now - LAST_CPU_IRQ(i, j); - - /* Update last_cpu_irq[][] for the next time */ - LAST_CPU_IRQ(i, j) = value_now; - - /* Ignore IRQs whose rate is less than the clock */ - if (delta < useful_load_threshold) - continue; - /* update the load for the processor or package total */ - IRQ_DELTA(package_index, j) += delta; - - /* Keep track of the higher numbered sibling as well */ - if (i != package_index) - CPU_IRQ(i) += delta; - /* - * We have sibling A and sibling B in the package - * - * cpu_irq[A] = load for cpu A + load for cpu B - * cpu_irq[B] = load for cpu B - */ - CPU_IRQ(package_index) += delta; - } - } - /* Find the least loaded processor package */ - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (min_cpu_irq > CPU_IRQ(i)) { - min_cpu_irq = CPU_IRQ(i); - min_loaded = i; - } - } - max_cpu_irq = ULONG_MAX; - -tryanothercpu: - /* - * Look for heaviest loaded processor. - * We may come back to get the next heaviest loaded processor. - * Skip processors with trivial loads. - */ - tmp_cpu_irq = 0; - tmp_loaded = -1; - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (max_cpu_irq <= CPU_IRQ(i)) - continue; - if (tmp_cpu_irq < CPU_IRQ(i)) { - tmp_cpu_irq = CPU_IRQ(i); - tmp_loaded = i; - } - } - - if (tmp_loaded == -1) { - /* - * In the case of small number of heavy interrupt sources, - * loading some of the cpus too much. We use Ingo's original - * approach to rotate them around. - */ - if (!first_attempt && imbalance >= useful_load_threshold) { - rotate_irqs_among_cpus(useful_load_threshold); - return; - } - goto not_worth_the_effort; - } - - first_attempt = 0; /* heaviest search */ - max_cpu_irq = tmp_cpu_irq; /* load */ - max_loaded = tmp_loaded; /* processor */ - imbalance = (max_cpu_irq - min_cpu_irq) / 2; - - /* - * if imbalance is less than approx 10% of max load, then - * observe diminishing returns action. - quit - */ - if (imbalance < (max_cpu_irq >> 3)) - goto not_worth_the_effort; - -tryanotherirq: - /* if we select an IRQ to move that can't go where we want, then - * see if there is another one to try. - */ - move_this_load = 0; - selected_irq = -1; - for (j = 0; j < nr_irqs; j++) { - /* Is this an active IRQ? */ - desc = irq_to_desc(j); - if (!desc->action) - continue; - if (imbalance <= IRQ_DELTA(max_loaded, j)) - continue; - /* Try to find the IRQ that is closest to the imbalance - * without going over. - */ - if (move_this_load < IRQ_DELTA(max_loaded, j)) { - move_this_load = IRQ_DELTA(max_loaded, j); - selected_irq = j; - } - } - if (selected_irq == -1) - goto tryanothercpu; - - imbalance = move_this_load; - - /* For physical_balance case, we accumulated both load - * values in the one of the siblings cpu_irq[], - * to use the same code for physical and logical processors - * as much as possible. - * - * NOTE: the cpu_irq[] array holds the sum of the load for - * sibling A and sibling B in the slot for the lowest numbered - * sibling (A), _AND_ the load for sibling B in the slot for - * the higher numbered sibling. - * - * We seek the least loaded sibling by making the comparison - * (A+B)/2 vs B - */ - load = CPU_IRQ(min_loaded) >> 1; - for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { - if (load > CPU_IRQ(j)) { - /* This won't change cpu_sibling_map[min_loaded] */ - load = CPU_IRQ(j); - min_loaded = j; - } - } - - cpus_and(allowed_mask, - cpu_online_map, - balance_irq_affinity[selected_irq]); - target_cpu_mask = cpumask_of_cpu(min_loaded); - cpus_and(tmp, target_cpu_mask, allowed_mask); - - if (!cpus_empty(tmp)) { - /* mark for change destination */ - set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); - - /* Since we made a change, come back sooner to - * check for more variation. - */ - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; - } - goto tryanotherirq; - -not_worth_the_effort: - /* - * if we did not find an IRQ to move, then adjust the time interval - * upward - */ - balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - return; -} - -static int balanced_irq(void *unused) -{ - int i; - unsigned long prev_balance_time = jiffies; - long time_remaining = balanced_irq_interval; - struct irq_desc *desc; - - /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < nr_irqs ; i++) { - desc = irq_to_desc(i); - desc->pending_mask = cpumask_of_cpu(0); - set_pending_irq(i, cpumask_of_cpu(0)); - } - - set_freezable(); - for ( ; ; ) { - time_remaining = schedule_timeout_interruptible(time_remaining); - try_to_freeze(); - if (time_after(jiffies, - prev_balance_time+balanced_irq_interval)) { - preempt_disable(); - do_irq_balance(); - prev_balance_time = jiffies; - time_remaining = balanced_irq_interval; - preempt_enable(); - } - } - return 0; -} - -static int __init balanced_irq_init(void) -{ - int i; - struct cpuinfo_x86 *c; - cpumask_t tmp; - - cpus_shift_right(tmp, cpu_online_map, 2); - c = &boot_cpu_data; - /* When not overwritten by the command line ask subarchitecture. */ - if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) - irqbalance_disabled = NO_BALANCE_IRQ; - if (irqbalance_disabled) - return 0; - - /* disable irqbalance completely if there is only one processor online */ - if (num_online_cpus() < 2) { - irqbalance_disabled = 1; - return 0; - } - /* - * Enable physical balance only if more than 1 physical processor - * is present - */ - if (smp_num_siblings > 1 && !cpus_empty(tmp)) - physical_balance = 1; - - for_each_online_cpu(i) { - irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); - irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); - if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { - printk(KERN_ERR "balanced_irq_init: out of memory"); - goto failed; - } - } - - printk(KERN_INFO "Starting balanced_irq\n"); - if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) - return 0; - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); -failed: - for_each_possible_cpu(i) { - kfree(irq_cpu_data[i].irq_delta); - irq_cpu_data[i].irq_delta = NULL; - kfree(irq_cpu_data[i].last_irq); - irq_cpu_data[i].last_irq = NULL; - } - return 0; -} - -int __devinit irqbalance_disable(char *str) -{ - irqbalance_disabled = 1; - return 1; -} - -__setup("noirqbalance", irqbalance_disable); - -late_initcall(balanced_irq_init); -#endif /* CONFIG_IRQBALANCE */ #endif /* CONFIG_SMP */ #ifndef CONFIG_SMP diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index f6a11b9b1f9..67465ed8931 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) if (!(word & (1 << 13))) { dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " "disabling irq balancing and affinity\n"); -#ifdef CONFIG_IRQBALANCE - irqbalance_disable(""); -#endif noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; diff --git a/include/linux/irq.h b/include/linux/irq.h index 704136138dc..2445d2b3d5d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -185,7 +185,7 @@ struct irq_desc { cpumask_t affinity; unsigned int cpu; #endif -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_t pending_mask; #endif #ifdef CONFIG_PROC_FS @@ -241,13 +241,13 @@ extern int setup_irq(unsigned int irq, struct irqaction *new); #ifdef CONFIG_SMP -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ void set_pending_irq(unsigned int irq, cpumask_t mask); void move_native_irq(int irq); void move_masked_irq(int irq); -#else /* CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE */ +#else /* CONFIG_GENERIC_PENDING_IRQ */ static inline void move_irq(int irq) { @@ -274,14 +274,6 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask) #endif /* CONFIG_SMP */ -#ifdef CONFIG_IRQBALANCE -extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask); -#else -static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ -} -#endif - extern int no_irq_affinity; static inline int irq_balancing_disabled(unsigned int irq) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6df49218632..ddc956861a5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -86,8 +86,6 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) if (!desc->chip->set_affinity) return -EINVAL; - set_balance_irq_affinity(irq, cpumask); - #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { unsigned long flags; @@ -122,7 +120,6 @@ int irq_select_affinity(unsigned int irq) desc->affinity = mask; desc->chip->set_affinity(irq, mask); - set_balance_irq_affinity(irq, mask); return 0; } #endif -- cgit v1.2.3-70-g09d2 From a1420f395d7721895c05ba3dedf150294d2c0e4d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:24 -0700 Subject: x86: add irq_cfg for 32bit it only contains vector ... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 71 ++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 668edf22606..033ad953bee 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -94,6 +94,43 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); static int disable_timer_pin_1 __initdata; +struct irq_cfg { + u8 vector; +}; + + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +static struct irq_cfg irq_cfg_legacy[] __initdata = { + [0] = { .vector = FIRST_DEVICE_VECTOR, }, +}; + +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_cfg *cfg; + int legacy_count; + int i; + + cfg = *da->name; + + legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + + BUG_ON(legacy_count > nr_irqs); + + memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); +} + +static struct irq_cfg *irq_cfgx; +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + if (irq >= nr_irqs) + return NULL; + + return &irq_cfgx[irq]; +} + /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. @@ -794,22 +831,6 @@ static inline int IO_APIC_irq_trigger(int irq) return 0; } -/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -static u8 irq_vector_init_first __initdata = FIRST_DEVICE_VECTOR; -static u8 *irq_vector; - -static void __init irq_vector_init_work(void *data) -{ - struct dyn_array *da = data; - - u8 *irq_vec; - - irq_vec = *da->name; - - irq_vec[0] = irq_vector_init_first; -} - -DEFINE_DYN_ARRAY(irq_vector, sizeof(u8), nr_irqs, PAGE_SIZE, irq_vector_init_work); static int __assign_irq_vector(int irq) { @@ -818,8 +839,8 @@ static int __assign_irq_vector(int irq) BUG_ON((unsigned)irq >= nr_irqs); - if (irq_vector[irq] > 0) - return irq_vector[irq]; + if (irq_cfg(irq)->vector > 0) + return irq_cfg(irq)->vector; vector = current_vector; offset = current_offset; @@ -836,7 +857,7 @@ next: current_vector = vector; current_offset = offset; - irq_vector[irq] = vector; + irq_cfg(irq)->vector = vector; return vector; } @@ -1598,7 +1619,7 @@ static void ack_ioapic_quirk_irq(unsigned int irq) * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro */ - i = irq_vector[irq]; + i = irq_cfg(irq)->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -1615,7 +1636,7 @@ static void ack_ioapic_quirk_irq(unsigned int irq) static int ioapic_retrigger_irq(unsigned int irq) { - send_IPI_self(irq_vector[irq]); + send_IPI_self(irq_cfg(irq)->vector); return 1; } @@ -1651,7 +1672,7 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for (irq = 0; irq < nr_irqs ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { + if (IO_APIC_IRQ(irq) && !irq_cfg(irq)->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -2102,7 +2123,7 @@ int create_irq(void) for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; - if (irq_vector[new] != 0) + if (irq_cfg(new)->vector != 0) continue; vector = __assign_irq_vector(new); if (likely(vector > 0)) @@ -2125,8 +2146,8 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); spin_lock_irqsave(&vector_lock, flags); - clear_bit(irq_vector[irq], used_vectors); - irq_vector[irq] = 0; + clear_bit(irq_cfg(irq)->vector, used_vectors); + irq_cfg(irq)->vector = 0; spin_unlock_irqrestore(&vector_lock, flags); } -- cgit v1.2.3-70-g09d2 From da51a821314dd0ec7126f2bf9a62117146fbb6b9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:25 -0700 Subject: x86: make 32bit use irq_cfg_alloc, etc Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 180 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 160 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 033ad953bee..5a83d7f5b14 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -94,41 +94,172 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); static int disable_timer_pin_1 __initdata; +struct irq_cfg; + struct irq_cfg { + unsigned int irq; + struct irq_cfg *next; u8 vector; }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .vector = FIRST_DEVICE_VECTOR, }, + [0] = { .irq = 0, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .vector = IRQ15_VECTOR, }, }; +static struct irq_cfg irq_cfg_init = { .irq = -1U, }; +/* need to be biger than size of irq_cfg_legacy */ +static int nr_irq_cfg = 32; + +static int __init parse_nr_irq_cfg(char *arg) +{ + if (arg) { + nr_irq_cfg = simple_strtoul(arg, NULL, 0); + if (nr_irq_cfg < 32) + nr_irq_cfg = 32; + } + return 0; +} + +early_param("nr_irq_cfg", parse_nr_irq_cfg); + +static void init_one_irq_cfg(struct irq_cfg *cfg) +{ + memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); +} + +static struct irq_cfg *irq_cfgx; +static struct irq_cfg *irq_cfgx_free; static void __init init_work(void *data) { - struct dyn_array *da = data; - struct irq_cfg *cfg; - int legacy_count; - int i; + struct dyn_array *da = data; + struct irq_cfg *cfg; + int legacy_count; + int i; + + cfg = *da->name; - cfg = *da->name; + memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + for (i = legacy_count; i < *da->nr; i++) + init_one_irq_cfg(&cfg[i]); - BUG_ON(legacy_count > nr_irqs); + for (i = 1; i < *da->nr; i++) + cfg[i-1].next = &cfg[i]; - memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); + irq_cfgx_free = &irq_cfgx[legacy_count]; + irq_cfgx[legacy_count - 1].next = NULL; } -static struct irq_cfg *irq_cfgx; -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); +#define for_each_irq_cfg(cfg) \ + for (cfg = irq_cfgx; cfg; cfg = cfg->next) + +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); static struct irq_cfg *irq_cfg(unsigned int irq) { - if (irq >= nr_irqs) - return NULL; + struct irq_cfg *cfg; + + cfg = irq_cfgx; + while (cfg) { + if (cfg->irq == irq) + return cfg; - return &irq_cfgx[irq]; + cfg = cfg->next; + } + + return NULL; +} + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + struct irq_cfg *cfg, *cfg_pri; + int i; + int count = 0; + + cfg_pri = cfg = irq_cfgx; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + cfg_pri = cfg; + cfg = cfg->next; + count++; + } + + if (!irq_cfgx_free) { + unsigned long phys; + unsigned long total_bytes; + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); + + total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; + if (after_bootmem) + cfg = kzalloc(total_bytes, GFP_ATOMIC); + else + cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); + + if (!cfg) + panic("please boot with nr_irq_cfg= %d\n", count * 2); + + phys = __pa(cfg); + printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + + for (i = 0; i < nr_irq_cfg; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < nr_irq_cfg; i++) + cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = cfg; + } + + cfg = irq_cfgx_free; + irq_cfgx_free = irq_cfgx_free->next; + cfg->next = NULL; + if (cfg_pri) + cfg_pri->next = cfg; + else + irq_cfgx = cfg; + cfg->irq = irq; + printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); + +#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG + { + /* dump the results */ + struct irq_cfg *cfg; + unsigned long phys; + unsigned long bytes = sizeof(struct irq_cfg); + + printk(KERN_DEBUG "=========================== %d\n", irq); + printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); + for_each_irq_cfg(cfg) { + phys = __pa(cfg); + printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); + } + printk(KERN_DEBUG "===========================\n"); + } +#endif + return cfg; } /* @@ -254,6 +385,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) { struct irq_pin_list *entry = irq_2_pin + irq; + irq_cfg_alloc(irq); while (entry->next) entry = irq_2_pin + entry->next; @@ -836,11 +968,13 @@ static int __assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, current_offset; int vector, offset; + struct irq_cfg *cfg; BUG_ON((unsigned)irq >= nr_irqs); - if (irq_cfg(irq)->vector > 0) - return irq_cfg(irq)->vector; + cfg = irq_cfg(irq); + if (cfg->vector > 0) + return cfg->vector; vector = current_vector; offset = current_offset; @@ -857,7 +991,7 @@ next: current_vector = vector; current_offset = offset; - irq_cfg(irq)->vector = vector; + cfg->vector = vector; return vector; } @@ -1659,6 +1793,7 @@ static inline void init_IO_APIC_traps(void) { int irq; struct irq_desc *desc; + struct irq_cfg *cfg; /* * NOTE! The local APIC isn't very good at handling @@ -1671,8 +1806,9 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < nr_irqs ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_cfg(irq)->vector) { + for_each_irq_cfg(cfg) { + irq = cfg->irq; + if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -2117,14 +2253,18 @@ int create_irq(void) /* Allocate an unused irq */ int irq, new, vector = 0; unsigned long flags; + struct irq_cfg *cfg_new; irq = -ENOSPC; spin_lock_irqsave(&vector_lock, flags); for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; - if (irq_cfg(new)->vector != 0) + cfg_new = irq_cfg(new); + if (cfg_new && cfg_new->vector != 0) continue; + if (!cfg_new) + cfg_new = irq_cfg_alloc(new); vector = __assign_irq_vector(new); if (likely(vector > 0)) irq = new; -- cgit v1.2.3-70-g09d2 From 0f978f4505e96227a89b3c9447552aca983c6b57 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:26 -0700 Subject: x86: make 32bit to use irq_2_pin in irq_cfg so it is more like 64 bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 152 +++++++++++++++++++++++++++++++++---------- 1 file changed, 117 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 5a83d7f5b14..59d2e8a273e 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -95,10 +95,11 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); static int disable_timer_pin_1 __initdata; struct irq_cfg; - +struct irq_pin_list; struct irq_cfg { unsigned int irq; struct irq_cfg *next; + struct irq_pin_list *irq_2_pin; u8 vector; }; @@ -275,11 +276,66 @@ int pin_map_size; * between pins and IRQs. */ -static struct irq_pin_list { - int apic, pin, next; -} *irq_2_pin; +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *irq_2_pin_head; +/* fill one page ? */ +static int nr_irq_2_pin = 0x100; +static struct irq_pin_list *irq_2_pin_ptr; +static void __init irq_2_pin_init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_pin_list *pin; + int i; + + pin = *da->name; + + for (i = 1; i < *da->nr; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = &pin[0]; +} +DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); + +static struct irq_pin_list *get_one_free_irq_2_pin(void) +{ + struct irq_pin_list *pin; + int i; + + pin = irq_2_pin_ptr; + + if (pin) { + irq_2_pin_ptr = pin->next; + pin->next = NULL; + return pin; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); + + if (after_bootmem) + pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, + GFP_ATOMIC); + else + pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * + nr_irq_2_pin, PAGE_SIZE, 0); + + if (!pin) + panic("can not get more irq_2_pin\n"); -DEFINE_DYN_ARRAY(irq_2_pin, sizeof(struct irq_pin_list), pin_map_size, 16, NULL); + for (i = 1; i < nr_irq_2_pin; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = pin->next; + pin->next = NULL; + + return pin; +} struct io_apic { unsigned int index; @@ -383,20 +439,34 @@ static void ioapic_mask_entry(int apic, int pin) */ static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + /* first time to refer irq_cfg, so with new */ + cfg = irq_cfg_alloc(irq); + entry = cfg->irq_2_pin; + if (!entry) { + entry = get_one_free_irq_2_pin(); + cfg->irq_2_pin = entry; + entry->apic = apic; + entry->pin = pin; + printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); + return; + } - irq_cfg_alloc(irq); - while (entry->next) - entry = irq_2_pin + entry->next; + while (entry->next) { + /* not again, please */ + if (entry->apic == apic && entry->pin == pin) + return; - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= pin_map_size) - panic("io_apic.c: whoops"); + entry = entry->next; } + + entry->next = get_one_free_irq_2_pin(); + entry = entry->next; entry->apic = apic; entry->pin = pin; + printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); } /* @@ -406,35 +476,45 @@ static void __init replace_pin_at_irq(unsigned int irq, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg = irq_cfg(irq); + struct irq_pin_list *entry = cfg->irq_2_pin; + int replaced = 0; - while (1) { + while (entry) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - } - if (!entry->next) + replaced = 1; + /* every one is different, right? */ break; - entry = irq_2_pin + entry->next; + } + entry = entry->next; } + + /* why? call replace before add? */ + if (!replaced) + add_pin_to_irq(irq, newapic, newpin); } static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; unsigned int pin, reg; + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; for (;;) { - pin = entry->pin; - if (pin == -1) + if (!entry) break; + pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); reg &= ~disable; reg |= enable; io_apic_modify(entry->apic, 0x10 + pin*2, reg); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } } @@ -509,13 +589,18 @@ static void clear_IO_APIC(void) #ifdef CONFIG_SMP static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) { + struct irq_cfg *cfg; unsigned long flags; int pin; - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_pin_list *entry; unsigned int apicid_value; cpumask_t tmp; struct irq_desc *desc; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + cpus_and(tmp, cpumask, cpu_online_map); if (cpus_empty(tmp)) tmp = TARGET_CPUS; @@ -527,13 +612,13 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) apicid_value = apicid_value << 24; spin_lock_irqsave(&ioapic_lock, flags); for (;;) { - pin = entry->pin; - if (pin == -1) + if (!entry) break; + pin = entry->pin; io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } desc = irq_to_desc(irq); desc->affinity = cpumask; @@ -1152,6 +1237,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_02 reg_02; union IO_APIC_reg_03 reg_03; unsigned long flags; + struct irq_cfg *cfg; if (apic_verbosity == APIC_QUIET) return; @@ -1240,16 +1326,16 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < nr_irqs; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) + for_each_irq_cfg(cfg) { + struct irq_pin_list *entry = cfg->irq_2_pin; + if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } printk("\n"); } @@ -1420,10 +1506,6 @@ static void __init enable_IO_APIC(void) int i, apic; unsigned long flags; - for (i = 0; i < pin_map_size; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } if (!pirqs_enabled) for (i = 0; i < MAX_PIRQS; i++) pirq_entries[i] = -1; -- cgit v1.2.3-70-g09d2 From 199751d715bba5b469ea22adadc68a4166bfa4f5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:27 -0700 Subject: x86: make 32 bit to use sparse_irq but actually irq still needs to be less than NR_IRQS, because interrupt[NR_IRQS] in entry.S. need to enable per_cpu vector... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/io_apic_32.c | 63 ++++++++++++++++++++++++++++++-------------- arch/x86/kernel/irq_32.c | 37 +++++++++++++++++++------- arch/x86/kernel/irqinit_32.c | 7 +++++ 4 files changed, 79 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3e0eaaa1a33..b157e94637b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,7 +34,7 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_DYN_ARRAY - select HAVE_SPARSE_IRQ if X86_64 + select HAVE_SPARSE_IRQ config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 59d2e8a273e..66c0a91362a 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -595,7 +595,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) struct irq_pin_list *entry; unsigned int apicid_value; cpumask_t tmp; - struct irq_desc *desc; cfg = irq_cfg(irq); @@ -620,8 +619,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) break; entry = entry->next; } - desc = irq_to_desc(irq); - desc->affinity = cpumask; + irq_to_desc(irq)->affinity = cpumask; spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1055,8 +1053,6 @@ static int __assign_irq_vector(int irq) int vector, offset; struct irq_cfg *cfg; - BUG_ON((unsigned)irq >= nr_irqs); - cfg = irq_cfg(irq); if (cfg->vector > 0) return cfg->vector; @@ -1103,7 +1099,12 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { struct irq_desc *desc; - desc = irq_to_desc(irq); + /* first time to use this irq_desc */ + if (irq < 16) + desc = irq_to_desc(irq); + else + desc = irq_to_desc_alloc(irq); + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) { desc->status |= IRQ_LEVEL; @@ -2330,16 +2331,19 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -int create_irq(void) +unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - int irq, new, vector = 0; + unsigned int irq, new, vector = 0; unsigned long flags; struct irq_cfg *cfg_new; - irq = -ENOSPC; + /* only can use bus/dev/fn.. when per_cpu vector is used */ + irq_want = nr_irqs - 1; + + irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (nr_irqs - 1); new >= 0; new--) { + for (new = (nr_irqs - 1); new > 0; new--) { if (platform_legacy_irq(new)) continue; cfg_new = irq_cfg(new); @@ -2354,13 +2358,18 @@ int create_irq(void) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq >= 0) { + if (irq > 0) { set_intr_gate(vector, interrupt[irq]); dynamic_irq_init(irq); } return irq; } +int create_irq(void) +{ + return create_irq_nr(nr_irqs - 1); +} + void destroy_irq(unsigned int irq) { unsigned long flags; @@ -2415,7 +2424,6 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp; int vector; - struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2435,8 +2443,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; + irq_to_desc(irq)->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2455,13 +2462,31 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { struct msi_msg msg; int irq, ret; - irq = create_irq(); - if (irq < 0) - return irq; + + unsigned int irq_want; + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + + if (irq == 0) + return -1; ret = msi_compose_msg(dev, irq, &msg); if (ret < 0) { @@ -2510,7 +2535,6 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { unsigned int dest; cpumask_t tmp; - struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2521,8 +2545,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(mask); target_ht_irq(irq, dest); - desc = irq_to_desc(irq); - desc->affinity = mask; + irq_to_desc(irq)->affinity = mask; } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 576c5df6cad..0a57e39159a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -224,9 +224,10 @@ unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs; /* high bit used in ret_from_ code */ int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; - if (unlikely((unsigned)irq >= nr_irqs)) { + desc = irq_to_desc(irq); + if (unlikely(!desc)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __func__, irq); BUG(); @@ -263,6 +264,24 @@ int show_interrupts(struct seq_file *p, void *v) int i = *(loff_t *) v, j; struct irqaction * action; unsigned long flags; + unsigned int entries; + struct irq_desc *desc; + int tail = 0; + +#ifdef CONFIG_HAVE_SPARSE_IRQ + desc = (struct irq_desc *)v; + entries = -1U; + i = desc->irq; + if (!desc->next) + tail = 1; +#else + entries = nr_irqs - 1; + i = *(loff_t *) v; + if (i == nr_irqs) + tail = 1; + else + desc = irq_to_desc(i); +#endif if (i == 0) { seq_printf(p, " "); @@ -271,9 +290,8 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < nr_irqs) { + if (i <= entries) { unsigned any_count = 0; - struct irq_desc *desc = irq_to_desc(i); spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP @@ -285,7 +303,7 @@ int show_interrupts(struct seq_file *p, void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%3d: ",i); + seq_printf(p, "%#x: ",i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else @@ -304,7 +322,9 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&desc->lock, flags); - } else if (i == nr_irqs) { + } + + if (tail) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", nmi_count(j)); @@ -396,15 +416,14 @@ void fixup_irqs(cpumask_t map) { unsigned int irq; static int warned; + struct irq_desc *desc; - for (irq = 0; irq < nr_irqs; irq++) { + for_each_irq_desc(irq, desc) { cpumask_t mask; - struct irq_desc *desc; if (irq == 2) continue; - desc = irq_to_desc(irq); cpus_and(mask, desc->affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 65c1c950770..ded09ac2642 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -69,6 +69,13 @@ void __init init_ISA_irqs (void) * 16 old-style INTA-cycle interrupts: */ for (i = 0; i < 16; i++) { + /* first time call this irq_desc */ + struct irq_desc *desc = irq_to_desc_alloc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + set_irq_chip_and_handler_name(i, &i8259A_chip, handle_level_irq, "XT"); } -- cgit v1.2.3-70-g09d2 From 497c9a195db918d3f035e8cb3021e5d4d035516e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:28 -0700 Subject: x86: make 32bit support per_cpu vector so we can merge io_apic_32.c and io_apic_64.c v2: Use cpu_online_map as target cpus for bigsmp, just like 64-bit is doing. Also remove some unused TARGET_CPUS macro. v3: need to check if desc is null in smp_irq_move_cleanup also migration needs to reset vector too, so copy __target_IO_APIC_irq from 64bit. (the duplication will go away once the two files are unified.) Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/io_apic_32.c | 719 +++++++++++++++++++----------- arch/x86/kernel/irq_32.c | 18 +- arch/x86/kernel/irqinit_32.c | 41 +- arch/x86/lguest/boot.c | 2 +- arch/x86/mach-generic/bigsmp.c | 4 + arch/x86/mach-generic/es7000.c | 14 + arch/x86/mach-generic/numaq.c | 14 + arch/x86/mach-generic/summit.c | 14 + include/asm-x86/bigsmp/apic.h | 15 +- include/asm-x86/es7000/apic.h | 3 +- include/asm-x86/genapic_32.h | 2 + include/asm-x86/hw_irq.h | 6 +- include/asm-x86/irq_vectors.h | 15 +- include/asm-x86/mach-default/entry_arch.h | 1 + include/asm-x86/mach-default/mach_apic.h | 15 +- include/asm-x86/mach-generic/mach_apic.h | 1 + include/asm-x86/numaq/apic.h | 2 - include/asm-x86/summit/apic.h | 1 - 19 files changed, 577 insertions(+), 312 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index b21fbfaffe3..4d82171d0f9 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -629,7 +629,7 @@ ENTRY(interrupt) ENTRY(irq_entries_start) RING0_INT_FRAME vector=0 -.rept NR_IRQS +.rept NR_VECTORS ALIGN .if vector CFI_ADJUST_CFA_OFFSET -4 diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 66c0a91362a..ea33d3c7497 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -48,6 +48,7 @@ #include #include +#include #include #include @@ -60,7 +61,7 @@ atomic_t irq_mis_count; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_SPINLOCK(vector_lock); int timer_through_8259 __initdata; @@ -100,28 +101,32 @@ struct irq_cfg { unsigned int irq; struct irq_cfg *next; struct irq_pin_list *irq_2_pin; + cpumask_t domain; + cpumask_t old_domain; + unsigned move_cleanup_count; u8 vector; + u8 move_in_progress : 1; }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .irq = 0, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .vector = IRQ15_VECTOR, }, + [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; static struct irq_cfg irq_cfg_init = { .irq = -1U, }; @@ -263,6 +268,7 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) return cfg; } +static int assign_irq_vector(int irq, cpumask_t mask); /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. @@ -432,6 +438,65 @@ static void ioapic_mask_entry(int apic, int pin) spin_unlock_irqrestore(&ioapic_lock, flags); } +#ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin *2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + cpumask_t tmp; + + cfg = irq_cfg(irq); + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + + dest = cpu_mask_to_apicid(tmp); + /* + * Only the high 8 bits are valid. + */ + dest = SET_APIC_LOGICAL_ID(dest); + + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + irq_to_desc(irq)->affinity = mask; + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#endif /* CONFIG_SMP */ + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -586,45 +651,6 @@ static void clear_IO_APIC(void) clear_IO_APIC_pin(apic, pin); } -#ifdef CONFIG_SMP -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) -{ - struct irq_cfg *cfg; - unsigned long flags; - int pin; - struct irq_pin_list *entry; - unsigned int apicid_value; - cpumask_t tmp; - - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - - cpus_and(tmp, cpumask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - cpus_and(cpumask, tmp, CPU_MASK_ALL); - - apicid_value = cpu_mask_to_apicid(cpumask); - /* Prepare to do the io_apic_write */ - apicid_value = apicid_value << 24; - spin_lock_irqsave(&ioapic_lock, flags); - for (;;) { - if (!entry) - break; - pin = entry->pin; - io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); - if (!entry->next) - break; - entry = entry->next; - } - irq_to_desc(irq)->affinity = cpumask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#endif /* CONFIG_SMP */ - #ifndef CONFIG_SMP void send_IPI_self(int vector) { @@ -789,32 +815,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif - #if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR @@ -1046,47 +1046,138 @@ static inline int IO_APIC_irq_trigger(int irq) return 0; } +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + spin_lock(&vector_lock); +} -static int __assign_irq_vector(int irq) +void unlock_vector_lock(void) { - static int current_vector = FIRST_DEVICE_VECTOR, current_offset; - int vector, offset; - struct irq_cfg *cfg; + spin_unlock(&vector_lock); +} - cfg = irq_cfg(irq); - if (cfg->vector > 0) - return cfg->vector; +static int __assign_irq_vector(int irq, cpumask_t mask) +{ + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + unsigned int old_vector; + int cpu; + struct irq_cfg *cfg; - vector = current_vector; - offset = current_offset; -next: - vector += 8; - if (vector >= first_system_vector) { - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (vector == current_vector) - return -ENOSPC; - if (test_and_set_bit(vector, used_vectors)) - goto next; + cfg = irq_cfg(irq); - current_vector = vector; - current_offset = offset; - cfg->vector = vector; + /* Only try and allocate irqs on cpus that are present */ + cpus_and(mask, mask, cpu_online_map); - return vector; -} + if ((cfg->move_in_progress) || cfg->move_cleanup_count) + return -EBUSY; -static int assign_irq_vector(int irq) -{ + old_vector = cfg->vector; + if (old_vector) { + cpumask_t tmp; + cpus_and(tmp, cfg->domain, mask); + if (!cpus_empty(tmp)) + return 0; + } + + for_each_cpu_mask_nr(cpu, mask) { + cpumask_t domain, new_mask; + int new_cpu; + int vector, offset; + + domain = vector_allocation_domain(cpu); + cpus_and(new_mask, domain, cpu_online_map); + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= first_system_vector) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; + if (vector == SYSCALL_VECTOR) + goto next; + + for_each_cpu_mask_nr(new_cpu, new_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cfg->old_domain = cfg->domain; + } + for_each_cpu_mask_nr(new_cpu, new_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cfg->domain = domain; + return 0; + } + return -ENOSPC; +} + +static int assign_irq_vector(int irq, cpumask_t mask) +{ + int err; unsigned long flags; - int vector; spin_lock_irqsave(&vector_lock, flags); - vector = __assign_irq_vector(irq); + err = __assign_irq_vector(irq, mask); spin_unlock_irqrestore(&vector_lock, flags); - return vector; + return err; +} + +static void __clear_irq_vector(int irq) +{ + struct irq_cfg *cfg; + cpumask_t mask; + int cpu, vector; + + cfg = irq_cfg(irq); + BUG_ON(!cfg->vector); + + vector = cfg->vector; + cpus_and(mask, cfg->domain, cpu_online_map); + for_each_cpu_mask_nr(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + cfg->vector = 0; + cpus_clear(cfg->domain); +} + +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + /* This function must be called with vector_lock held */ + int irq, vector; + struct irq_cfg *cfg; + + /* Mark the inuse vectors */ + for_each_irq_cfg(cfg) { + if (!cpu_isset(cpu, cfg->domain)) + continue; + vector = cfg->vector; + irq = cfg->irq; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq < 0) + continue; + + cfg = irq_cfg(irq); + if (!cpu_isset(cpu, cfg->domain)) + per_cpu(vector_irq, cpu)[vector] = -1; + } } static struct irq_chip ioapic_chip; @@ -1095,7 +1186,7 @@ static struct irq_chip ioapic_chip; #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 -static void ioapic_register_intr(int irq, int vector, unsigned long trigger) +static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; @@ -1115,79 +1206,109 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); } - set_intr_gate(vector, interrupt[irq]); } -static void __init setup_IO_APIC_irqs(void) +static int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector) { + /* + * add it to the IO-APIC irq-routing table: + */ + memset(entry,0,sizeof(*entry)); + + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest.logical.logical_dest = destination; + + entry->mask = 0; /* enable IRQ */ + entry->trigger = trigger; + entry->polarity = polarity; + entry->vector = vector; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (trigger) + entry->mask = 1; + + return 0; +} + +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, + int trigger, int polarity) +{ + struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; + cpumask_t mask; + + if (!IO_APIC_IRQ(irq)) + return; + + cfg = irq_cfg(irq); + + mask = TARGET_CPUS; + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(mask, cfg->domain, mask); + + apic_printk(APIC_VERBOSE,KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i)\n", + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, + irq, trigger, polarity); + + + if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, + cpu_mask_to_apicid(mask), trigger, polarity, + cfg->vector)) { + printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mp_ioapics[apic].mp_apicid, pin); + __clear_irq_vector(irq); + return; + } + + ioapic_register_intr(irq, trigger); + if (irq < 16) + disable_8259A_irq(irq); + + ioapic_write_entry(apic, pin, entry); +} + +static void __init setup_IO_APIC_irqs(void) +{ + int apic, pin, idx, irq, first_notcon = 1; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); for (apic = 0; apic < nr_ioapics; apic++) { for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = - cpu_mask_to_apicid(TARGET_CPUS); - - idx = find_irq_entry(apic, pin, mp_INT); + idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - " IO-APIC (apicid-pin) %d-%d", - mp_ioapics[apic].mp_apicid, - pin); + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); first_notcon = 0; } else - apic_printk(APIC_VERBOSE, ", %d-%d", - mp_ioapics[apic].mp_apicid, pin); + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); continue; } - if (!first_notcon) { apic_printk(APIC_VERBOSE, " not connected.\n"); first_notcon = 1; } - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - } - irq = pin_2_irq(idx, apic, pin); - /* - * skip adding the timer int on secondary nodes, which causes - * a small but painful rift in the time-space continuum - */ - if (multi_timer_check(apic, irq)) - continue; - else - add_pin_to_irq(irq, apic, pin); - if (!apic && !IO_APIC_IRQ(irq)) - continue; + if (multi_timer_check(apic, irq)) + continue; - if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); + add_pin_to_irq(irq, apic, pin); - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - ioapic_write_entry(apic, pin, entry); + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); } } @@ -1221,7 +1342,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, * The timer IRQ doesn't have to know that behind the * scene we may have a 8259A-master in AEOI mode ... */ - ioapic_register_intr(0, vector, IOAPIC_EDGE); + ioapic_register_intr(0, IOAPIC_EDGE); /* * Add it to the IO-APIC irq-routing table: @@ -1805,8 +1926,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } +static void irq_complete_move(unsigned int irq); static void ack_ioapic_irq(unsigned int irq) { + irq_complete_move(irq); move_native_irq(irq); ack_APIC_irq(); } @@ -1816,6 +1939,7 @@ static void ack_ioapic_quirk_irq(unsigned int irq) unsigned long v; int i; + irq_complete_move(irq); move_native_irq(irq); /* * It appears there is an erratum which affects at least version 0x11 @@ -1858,6 +1982,64 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } +#ifdef CONFIG_SMP +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + ack_APIC_irq(); + irq_enter(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + goto unlock; + + __get_cpu_var(vector_irq)[vector] = -1; + cfg->move_cleanup_count--; +unlock: + spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void irq_complete_move(unsigned int irq) +{ + struct irq_cfg *cfg = irq_cfg(irq); + unsigned vector, me; + + if (likely(!cfg->move_in_progress)) + return; + + vector = ~get_irq_regs()->orig_ax; + me = smp_processor_id(); + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { + cpumask_t cleanup_mask; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } +} +#else +static inline void irq_complete_move(unsigned int irq) {} +#endif + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .startup = startup_ioapic_irq, @@ -1940,7 +2122,7 @@ static struct irq_chip lapic_chip __read_mostly = { .ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq, int vector) +static void lapic_register_intr(int irq) { struct irq_desc *desc; @@ -1948,7 +2130,6 @@ static void lapic_register_intr(int irq, int vector) desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); - set_intr_gate(vector, interrupt[irq]); } static void __init setup_nmi(void) @@ -2036,9 +2217,9 @@ static inline void __init unlock_ExtINT_logic(void) */ static inline void __init check_timer(void) { + struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; int no_pin1 = 0; - int vector; unsigned int ver; unsigned long flags; @@ -2051,8 +2232,7 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); + assign_irq_vector(0, TARGET_CPUS); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2074,7 +2254,7 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " "apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -2098,7 +2278,7 @@ static inline void __init check_timer(void) */ if (no_pin1) { add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, vector); + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } unmask_IO_APIC_irq(0); if (timer_irq_works()) { @@ -2123,7 +2303,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, vector); + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { @@ -2154,8 +2334,8 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0, vector); - apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { @@ -2163,7 +2343,7 @@ static inline void __init check_timer(void) goto out; } disable_8259A_irq(0); - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); apic_printk(APIC_QUIET, KERN_INFO @@ -2207,12 +2387,6 @@ out: void __init setup_IO_APIC(void) { - int i; - - /* Reserve all the system vectors. */ - for (i = first_system_vector; i < NR_VECTORS; i++) - set_bit(i, used_vectors); - enable_IO_APIC(); io_apic_irqs = ~PIC_IRQS; @@ -2334,12 +2508,14 @@ device_initcall(ioapic_init_sysfs); unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - unsigned int irq, new, vector = 0; + unsigned int irq, new; unsigned long flags; struct irq_cfg *cfg_new; +#ifndef CONFIG_HAVE_SPARSE_IRQ /* only can use bus/dev/fn.. when per_cpu vector is used */ irq_want = nr_irqs - 1; +#endif irq = 0; spin_lock_irqsave(&vector_lock, flags); @@ -2351,15 +2527,13 @@ unsigned int create_irq_nr(unsigned int irq_want) continue; if (!cfg_new) cfg_new = irq_cfg_alloc(new); - vector = __assign_irq_vector(new); - if (likely(vector > 0)) + if (__assign_irq_vector(new, TARGET_CPUS) == 0) irq = new; break; } spin_unlock_irqrestore(&vector_lock, flags); if (irq > 0) { - set_intr_gate(vector, interrupt[irq]); dynamic_irq_init(irq); } return irq; @@ -2377,8 +2551,7 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); spin_lock_irqsave(&vector_lock, flags); - clear_bit(irq_cfg(irq)->vector, used_vectors); - irq_cfg(irq)->vector = 0; + __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); } @@ -2388,57 +2561,65 @@ void destroy_irq(unsigned int irq) #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { - int vector; + struct irq_cfg *cfg; + int err; unsigned dest; + cpumask_t tmp; - vector = assign_irq_vector(irq); - if (vector >= 0) { - dest = cpu_mask_to_apicid(TARGET_CPUS); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? -MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (err) + return err; - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? -MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(vector); - } - return vector; + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + + return err; } #ifdef CONFIG_SMP static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) { + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; - int vector; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) - tmp = TARGET_CPUS; + return; - vector = assign_irq_vector(irq); - if (vector < 0) + if (assign_irq_vector(irq, mask)) return; - dest = cpu_mask_to_apicid(mask); + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); read_msi_msg(irq, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(vector); + msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); @@ -2517,15 +2698,15 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_SMP -static void target_ht_irq(unsigned int irq, unsigned int dest) +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) { struct ht_irq_msg msg; fetch_ht_irq_msg(irq, &msg); - msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); write_ht_irq_msg(irq, &msg); @@ -2533,18 +2714,22 @@ static void target_ht_irq(unsigned int irq, unsigned int dest) static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { + struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) - tmp = TARGET_CPUS; + return; - cpus_and(mask, tmp, CPU_MASK_ALL); + if (assign_irq_vector(irq, mask)) + return; - dest = cpu_mask_to_apicid(mask); + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); - target_ht_irq(irq, dest); + target_ht_irq(irq, dest, cfg->vector); irq_to_desc(irq)->affinity = mask; } #endif @@ -2562,16 +2747,18 @@ static struct irq_chip ht_irq_chip = { int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { - int vector; + struct irq_cfg *cfg; + int err; + cpumask_t tmp; - vector = assign_irq_vector(irq); - if (vector >= 0) { + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if ( !err) { struct ht_irq_msg msg; unsigned dest; - cpumask_t tmp; - cpus_clear(tmp); - cpu_set(vector >> 8, tmp); + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@ -2579,7 +2766,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) msg.address_lo = HT_IRQ_LOW_BASE | HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(vector) | + HT_IRQ_LOW_VECTOR(cfg->vector) | ((INT_DEST_MODE == 0) ? HT_IRQ_LOW_DM_PHYSICAL : HT_IRQ_LOW_DM_LOGICAL) | @@ -2594,7 +2781,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) set_irq_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); } - return vector; + return err; } #endif /* CONFIG_HT_IRQ */ @@ -2705,50 +2892,21 @@ int __init io_apic_get_redir_entries(int ioapic) } -int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low) +int io_apic_set_pci_routing(int ioapic, int pin, int irq, int triggering, int polarity) { - struct IO_APIC_route_entry entry; - if (!IO_APIC_IRQ(irq)) { printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); return -EINVAL; } - /* - * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. - * Note that we mask (disable) IRQs now -- these get enabled when the - * corresponding device driver registers for this IRQ. - */ - - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.trigger = edge_level; - entry.polarity = active_high_low; - entry.mask = 1; - /* * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= 16) add_pin_to_irq(irq, ioapic, pin); - entry.vector = assign_irq_vector(irq); - - apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " - "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq, - edge_level, active_high_low); - - ioapic_register_intr(irq, entry.vector, edge_level); - - if (!ioapic && (irq < 16)) - disable_8259A_irq(irq); - - ioapic_write_entry(ioapic, pin, entry); + setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); return 0; } @@ -2774,6 +2932,47 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #endif /* CONFIG_ACPI */ +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + struct irq_cfg *cfg; + struct irq_desc *desc; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + /* setup_IO_APIC_irqs could fail to get vector for some device + * when you have too many devices, because at that time only boot + * cpu is online. + */ + cfg = irq_cfg(irq); + if (!cfg->vector) + setup_IO_APIC_irq(ioapic, pin, irq, + irq_trigger(irq_entry), + irq_polarity(irq_entry)); + else { + desc = irq_to_desc(irq); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + } + + } +} +#endif + static int __init parse_disable_timer_pin_1(char *arg) { disable_timer_pin_1 = 1; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 0a57e39159a..b51ffdcfa31 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -223,21 +223,25 @@ unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int overflow, irq = ~regs->orig_ax; + int overflow; + unsigned vector = ~regs->orig_ax; struct irq_desc *desc; + unsigned irq; - desc = irq_to_desc(irq); - if (unlikely(!desc)) { - printk(KERN_EMERG "%s: cannot handle IRQ %d\n", - __func__, irq); - BUG(); - } old_regs = set_irq_regs(regs); irq_enter(); + irq = __get_cpu_var(vector_irq)[vector]; overflow = check_stack_overflow(); + desc = irq_to_desc(irq); + if (unlikely(!desc)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x\n", + __func__, irq, vector); + BUG(); + } + if (!execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ded09ac2642..9092103a18e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -90,6 +90,27 @@ static struct irqaction irq2 = { .name = "cascade", }; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@ -105,22 +126,14 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= nr_irqs) - break; + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i]); } -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) - /* - * IRQ0 must be given a fixed assignment and initialized, - * because it's used before the IO-APIC is set up. - */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -135,6 +148,9 @@ void __init native_init_IRQ(void) /* IPI for single call function */ set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -168,3 +184,4 @@ void __init native_init_IRQ(void) irq_ctx_init(smp_processor_id()); } + diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 65f0b8a47be..48ee4f9435f 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -582,7 +582,7 @@ static void __init lguest_init_IRQ(void) for (i = 0; i < LGUEST_IRQS; i++) { int vector = FIRST_EXTERNAL_VECTOR + i; if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, interrupt[i]); + set_intr_gate(vector, interrupt[vector]); set_irq_chip_and_handler_name(i, &lguest_irq_controller, handle_level_irq, "level"); diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index df37fc9d6a2..3c3b471ea49 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -41,6 +41,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } }; +static cpumask_t vector_allocation_domain(int cpu) +{ + return cpumask_of_cpu(cpu); +} static int probe_bigsmp(void) { diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 6513d41ea21..28459cab3dd 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -75,4 +75,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 8cf58394975..71a309b122e 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 6ad6b67a723..6272b5e69da 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -23,4 +23,18 @@ static int probe_summit(void) return 0; } +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic apic_summit = APIC_INIT("summit", probe_summit); diff --git a/include/asm-x86/bigsmp/apic.h b/include/asm-x86/bigsmp/apic.h index 0a9cd7c5ca0..1d9543b9d35 100644 --- a/include/asm-x86/bigsmp/apic.h +++ b/include/asm-x86/bigsmp/apic.h @@ -9,22 +9,17 @@ static inline int apic_id_registered(void) return (1); } -/* Round robin the irqs amoung the online cpus */ static inline cpumask_t target_cpus(void) { - static unsigned long cpu = NR_CPUS; - do { - if (cpu >= NR_CPUS) - cpu = first_cpu(cpu_online_map); - else - cpu = next_cpu(cpu, cpu_online_map); - } while (cpu >= NR_CPUS); - return cpumask_of_cpu(cpu); +#ifdef CONFIG_SMP + return cpu_online_map; +#else + return cpumask_of_cpu(0); +#endif } #undef APIC_DEST_LOGICAL #define APIC_DEST_LOGICAL 0 -#define TARGET_CPUS (target_cpus()) #define APIC_DFR_VALUE (APIC_DFR_FLAT) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target proc */ diff --git a/include/asm-x86/es7000/apic.h b/include/asm-x86/es7000/apic.h index bd2c44d1f7a..750afada5fb 100644 --- a/include/asm-x86/es7000/apic.h +++ b/include/asm-x86/es7000/apic.h @@ -17,7 +17,6 @@ static inline cpumask_t target_cpus(void) return cpumask_of_cpu(smp_processor_id()); #endif } -#define TARGET_CPUS (target_cpus()) #if defined CONFIG_ES7000_CLUSTERED_APIC #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) @@ -81,7 +80,7 @@ static inline void setup_apic_routing(void) int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? - "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(TARGET_CPUS)[0]); + "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]); } static inline int multi_timer_check(int apic, int irq) diff --git a/include/asm-x86/genapic_32.h b/include/asm-x86/genapic_32.h index 34280f02766..6fe4f81bfcf 100644 --- a/include/asm-x86/genapic_32.h +++ b/include/asm-x86/genapic_32.h @@ -57,6 +57,7 @@ struct genapic { unsigned (*get_apic_id)(unsigned long x); unsigned long apic_id_mask; unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); + cpumask_t (*vector_allocation_domain)(int cpu); #ifdef CONFIG_SMP /* ipi */ @@ -104,6 +105,7 @@ struct genapic { APICFUNC(get_apic_id) \ .apic_id_mask = APIC_ID_MASK, \ APICFUNC(cpu_mask_to_apicid) \ + APICFUNC(vector_allocation_domain) \ APICFUNC(acpi_madt_oem_check) \ IPIFUNC(send_IPI_mask) \ IPIFUNC(send_IPI_allbutself) \ diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index 50f6e0316b5..51c787d17cb 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -116,12 +116,12 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #ifdef CONFIG_X86_32 extern void (*const interrupt[NR_IRQS])(void); -#else +#endif + typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); -#endif -#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_X86_64) +#ifdef CONFIG_X86_IO_APIC extern void lock_vector_lock(void); extern void unlock_vector_lock(void); extern void __setup_vector_irq(int cpu); diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h index cb09802ce65..a8d065d85f5 100644 --- a/include/asm-x86/irq_vectors.h +++ b/include/asm-x86/irq_vectors.h @@ -19,19 +19,14 @@ /* * Reserve the lowest usable priority level 0x20 - 0x2f for triggering - * cleanup after irq migration on 64 bit. + * cleanup after irq migration. */ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR /* - * Vectors 0x20-0x2f are used for ISA interrupts on 32 bit. - * Vectors 0x30-0x3f are used for ISA interrupts on 64 bit. + * Vectors 0x30-0x3f are used for ISA interrupts. */ -#ifdef CONFIG_X86_32 -#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR) -#else #define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) -#endif #define IRQ1_VECTOR (IRQ0_VECTOR + 1) #define IRQ2_VECTOR (IRQ0_VECTOR + 2) #define IRQ3_VECTOR (IRQ0_VECTOR + 3) @@ -96,11 +91,7 @@ * start at 0x31(0x41) to spread out vectors evenly between priority * levels. (0x80 is the syscall vector) */ -#ifdef CONFIG_X86_32 -# define FIRST_DEVICE_VECTOR 0x31 -#else -# define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) -#endif +#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) #define NR_VECTORS 256 diff --git a/include/asm-x86/mach-default/entry_arch.h b/include/asm-x86/mach-default/entry_arch.h index 9283b60a1dd..6b1add8e31d 100644 --- a/include/asm-x86/mach-default/entry_arch.h +++ b/include/asm-x86/mach-default/entry_arch.h @@ -14,6 +14,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) +BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) #endif /* diff --git a/include/asm-x86/mach-default/mach_apic.h b/include/asm-x86/mach-default/mach_apic.h index 2a330a41b3d..3c66f2cdaec 100644 --- a/include/asm-x86/mach-default/mach_apic.h +++ b/include/asm-x86/mach-default/mach_apic.h @@ -85,6 +85,20 @@ static inline int apicid_to_node(int logical_apicid) return 0; #endif } + +static inline cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} #endif static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) @@ -138,6 +152,5 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) static inline void enable_apic_mode(void) { } - #endif /* CONFIG_X86_LOCAL_APIC */ #endif /* ASM_X86__MACH_DEFAULT__MACH_APIC_H */ diff --git a/include/asm-x86/mach-generic/mach_apic.h b/include/asm-x86/mach-generic/mach_apic.h index 5d010c6881d..5085b52da30 100644 --- a/include/asm-x86/mach-generic/mach_apic.h +++ b/include/asm-x86/mach-generic/mach_apic.h @@ -24,6 +24,7 @@ #define check_phys_apicid_present (genapic->check_phys_apicid_present) #define check_apicid_used (genapic->check_apicid_used) #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define vector_allocation_domain (genapic->vector_allocation_domain) #define enable_apic_mode (genapic->enable_apic_mode) #define phys_pkg_id (genapic->phys_pkg_id) diff --git a/include/asm-x86/numaq/apic.h b/include/asm-x86/numaq/apic.h index a8344ba6ea1..0bf2a06b7a4 100644 --- a/include/asm-x86/numaq/apic.h +++ b/include/asm-x86/numaq/apic.h @@ -12,8 +12,6 @@ static inline cpumask_t target_cpus(void) return CPU_MASK_ALL; } -#define TARGET_CPUS (target_cpus()) - #define NO_BALANCE_IRQ (1) #define esr_disable (1) diff --git a/include/asm-x86/summit/apic.h b/include/asm-x86/summit/apic.h index c5b2e4b1035..0f68037b8f2 100644 --- a/include/asm-x86/summit/apic.h +++ b/include/asm-x86/summit/apic.h @@ -22,7 +22,6 @@ static inline cpumask_t target_cpus(void) */ return cpumask_of_cpu(0); } -#define TARGET_CPUS (target_cpus()) #define INT_DELIVERY_MODE (dest_LowestPrio) #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ -- cgit v1.2.3-70-g09d2 From 7a959cff725872ce9c3a534f10724d7bb2cb3c4a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:32 -0700 Subject: x86: add debug info for 32bit sparse_irq so could figure out bugs where we get an interrupt, but vector_irq is not initialized yet. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 8 ++++++-- arch/x86/kernel/io_apic_64.c | 2 ++ arch/x86/kernel/irq_32.c | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index ea33d3c7497..3001924bdd3 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1114,8 +1114,12 @@ next: cfg->move_in_progress = 1; cfg->old_domain = cfg->domain; } - for_each_cpu_mask_nr(new_cpu, new_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; + printk(KERN_DEBUG "assign_irq_vector: irq %d vector %#x cpu ", irq, vector); + for_each_cpu_mask_nr(new_cpu, new_mask) { + per_cpu(vector_irq, new_cpu)[vector] = irq; + printk(KERN_CONT " %d ", new_cpu); + } + printk(KERN_CONT "\n"); cfg->vector = vector; cfg->domain = domain; return 0; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index b0d4abc55a1..30d2e381131 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1394,6 +1394,8 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index b51ffdcfa31..cc929f2f84f 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -237,8 +237,8 @@ unsigned int do_IRQ(struct pt_regs *regs) desc = irq_to_desc(irq); if (unlikely(!desc)) { - printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x\n", - __func__, irq, vector); + printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", + __func__, irq, vector, smp_processor_id()); BUG(); } -- cgit v1.2.3-70-g09d2 From d83e94acd95789829804fd9e442bd18975f4dc89 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:33 -0700 Subject: x86, io-apic: remove union about dest for log/phy let user decide the meaning of the bits. This unifies the 32-bit and 64-bit io-apic code a bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 18 +++++++----------- include/asm-x86/io_apic.h | 16 ---------------- 2 files changed, 7 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 3001924bdd3..353e586822a 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1224,7 +1224,7 @@ static int setup_ioapic_entry(int apic, int irq, entry->delivery_mode = INT_DELIVERY_MODE; entry->dest_mode = INT_DEST_MODE; - entry->dest.logical.logical_dest = destination; + entry->dest = destination; entry->mask = 0; /* enable IRQ */ entry->trigger = trigger; @@ -1336,7 +1336,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, */ entry.dest_mode = INT_DEST_MODE; entry.mask = 1; /* mask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; entry.trigger = 0; @@ -1425,19 +1425,15 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, i); - printk(KERN_DEBUG " %02x %03X %02X ", - i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest - ); + printk(KERN_DEBUG " %02x %02X ", i, entry.dest); printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", entry.mask, @@ -1717,7 +1713,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest.physical.physical_dest = read_apic_id(); + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -2185,7 +2181,7 @@ static inline void __init unlock_ExtINT_logic(void) entry1.dest_mode = 0; /* physical delivery */ entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; entry1.trigger = 0; diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index 8ec68a50cf1..ce818292d2c 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -87,24 +87,8 @@ struct IO_APIC_route_entry { mask : 1, /* 0: enabled, 1: disabled */ __reserved_2 : 15; -#ifdef CONFIG_X86_32 - union { - struct { - __u32 __reserved_1 : 24, - physical_dest : 4, - __reserved_2 : 4; - } physical; - - struct { - __u32 __reserved_1 : 24, - logical_dest : 8; - } logical; - } dest; -#else __u32 __reserved_3 : 24, dest : 8; -#endif - } __attribute__ ((packed)); struct IR_IO_APIC_route_entry { -- cgit v1.2.3-70-g09d2 From 1d02519242c23450b043e5e8a9e3cb84a8666fe3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:34 -0700 Subject: x86: ordering functions in io_apic_32.c prepare for unification: try to make functions be of the same order to io_apic_64.c. v2: add calling setup_msi_irq back to arch_setup_msi_irq Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 178 +++++++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 353e586822a..9531ef33362 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1029,23 +1029,6 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} - void lock_vector_lock(void) { /* Used to the online set of cpus does not change @@ -1190,6 +1173,23 @@ static struct irq_chip ioapic_chip; #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; @@ -1926,55 +1926,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } -static void irq_complete_move(unsigned int irq); -static void ack_ioapic_irq(unsigned int irq) -{ - irq_complete_move(irq); - move_native_irq(irq); - ack_APIC_irq(); -} - -static void ack_ioapic_quirk_irq(unsigned int irq) -{ - unsigned long v; - int i; - - irq_complete_move(irq); - move_native_irq(irq); -/* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_cfg(irq)->vector; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} - static int ioapic_retrigger_irq(unsigned int irq) { send_IPI_self(irq_cfg(irq)->vector); @@ -2040,13 +1991,61 @@ static void irq_complete_move(unsigned int irq) static inline void irq_complete_move(unsigned int irq) {} #endif +static void ack_apic_edge(unsigned int irq) +{ + irq_complete_move(irq); + move_native_irq(irq); + ack_APIC_irq(); +} + +static void ack_apic_level(unsigned int irq) +{ + unsigned long v; + int i; + + irq_complete_move(irq); + move_native_irq(irq); +/* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .startup = startup_ioapic_irq, .mask = mask_IO_APIC_irq, .unmask = unmask_IO_APIC_irq, - .ack = ack_ioapic_irq, - .eoi = ack_ioapic_quirk_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity_irq, #endif @@ -2094,11 +2093,6 @@ static inline void init_IO_APIC_traps(void) * The local APIC irq-chip implementation: */ -static void ack_lapic_irq(unsigned int irq) -{ - ack_APIC_irq(); -} - static void mask_lapic_irq(unsigned int irq) { unsigned long v; @@ -2115,6 +2109,11 @@ static void unmask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } +static void ack_lapic_irq(unsigned int irq) +{ + ack_APIC_irq(); +} + static struct irq_chip lapic_chip __read_mostly = { .name = "local-APIC", .mask = mask_lapic_irq, @@ -2636,13 +2635,31 @@ static struct irq_chip msi_chip = { .name = "PCI-MSI", .unmask = unmask_msi_irq, .mask = mask_msi_irq, - .ack = ack_ioapic_irq, + .ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_msi_irq_affinity, #endif .retrigger = ioapic_retrigger_irq, }; + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + return 0; +} + static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) { unsigned int irq; @@ -2657,7 +2674,6 @@ static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { - struct msi_msg msg; int irq, ret; unsigned int irq_want; @@ -2669,17 +2685,11 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) if (irq == 0) return -1; - ret = msi_compose_msg(dev, irq, &msg); + ret = setup_msi_irq(dev, desc, irq); if (ret < 0) { destroy_irq(irq); return ret; - } - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, - "edge"); + } return 0; } @@ -2738,7 +2748,7 @@ static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .mask = mask_ht_irq, .unmask = unmask_ht_irq, - .ack = ack_ioapic_irq, + .ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_ht_irq_affinity, #endif -- cgit v1.2.3-70-g09d2 From 8ea5371baa82db452a8d93e9977b418d30944e32 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:35 -0700 Subject: x86: ordering functions in io_apic_64.c try to make functions have the same order between 32-bit and 64-bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 67 ++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 30d2e381131..07e1e45ee02 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -409,40 +409,6 @@ static bool io_apic_level_ack_pending(unsigned int irq) return false; } -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_cfg *cfg; \ - struct irq_pin_list *entry; \ - \ - cfg = irq_cfg(irq); \ - entry = cfg->irq_2_pin; \ - for (;;) { \ - unsigned int reg; \ - if (!entry) \ - break; \ - pin = entry->pin; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = entry->next; \ - } \ -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -627,6 +593,39 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); +} + +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_cfg *cfg; \ + struct irq_pin_list *entry; \ + \ + cfg = irq_cfg(irq); \ + entry = cfg->irq_2_pin; \ + for (;;) { \ + unsigned int reg; \ + if (!entry) \ + break; \ + pin = entry->pin; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, reg); \ + FINAL; \ + if (!entry->next) \ + break; \ + entry = entry->next; \ + } \ +} #define DO_ACTION(name,R,ACTION, FINAL) \ \ -- cgit v1.2.3-70-g09d2 From efa2559f65167989f1893cb065e3126d4f13ba60 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:36 -0700 Subject: x86: order variables in io_apic_xx.c move first_system_vector to apic_64.c. also add #ifdef CONFIG_INTR_REMAP to prepare 32 bit to use same file. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 5 ++ arch/x86/kernel/io_apic_32.c | 79 ++++++++++----------- arch/x86/kernel/io_apic_64.c | 160 +++++++++++++++++++++++-------------------- 3 files changed, 127 insertions(+), 117 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 94ddb69ae15..4d7a188025b 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,10 @@ int x2apic_preenabled; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + /* * Debug level, exported for io_apic.c */ diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 9531ef33362..3010bdd3352 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -54,24 +54,22 @@ #define __apicdebuginit(type) static type __init -int (*ioapic_renumber_irq)(int ioapic, int irq); -atomic_t irq_mis_count; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -int timer_through_8259 __initdata; - /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes */ int sis_apic_bug = -1; +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + int first_free_entry; +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +int pin_map_size; + /* * # of IRQ routing registers */ @@ -93,7 +91,15 @@ int mp_bus_id_to_type[MAX_MP_BUSSES]; DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -static int disable_timer_pin_1 __initdata; +int skip_ioapic_setup; + +static int __init parse_noapic(char *arg) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); struct irq_cfg; struct irq_pin_list; @@ -268,13 +274,6 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) return cfg; } -static int assign_irq_vector(int irq, cpumask_t mask); -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -int pin_map_size; - /* * This is performance-critical, we want to do it O(1) * @@ -465,6 +464,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) entry = entry->next; } } + +static int assign_irq_vector(int irq, cpumask_t mask); + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { struct irq_cfg *cfg; @@ -677,7 +679,6 @@ void send_IPI_self(int vector) #define MAX_PIRQS 8 static int pirq_entries [MAX_PIRQS]; static int pirqs_enabled; -int skip_ioapic_setup; static int __init ioapic_pirq_setup(char *str) { @@ -981,6 +982,7 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } +int (*ioapic_renumber_irq)(int ioapic, int irq); static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -1621,6 +1623,9 @@ __apicdebuginit(int) print_all_ICs(void) fs_initcall(print_all_ICs); +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + static void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; @@ -1998,6 +2003,7 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } +atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { unsigned long v; @@ -2208,6 +2214,17 @@ static inline void __init unlock_ExtINT_logic(void) ioapic_write_entry(apic, pin, entry0); } +static int disable_timer_pin_1 __initdata; + +static int __init parse_disable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", parse_disable_timer_pin_1); + +int timer_through_8259 __initdata; + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2983,28 +3000,6 @@ void __init setup_ioapic_dest(void) } #endif -static int __init parse_disable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = 1; - return 0; -} -early_param("disable_timer_pin_1", parse_disable_timer_pin_1); - -static int __init parse_enable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = -1; - return 0; -} -early_param("enable_timer_pin_1", parse_enable_timer_pin_1); - -static int __init parse_noapic(char *arg) -{ - /* disable IO-APIC */ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 07e1e45ee02..847aa798764 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -57,6 +57,47 @@ #define __apicdebuginit(type) static type __init +int ioapic_force; + +int sis_apic_bug; /* not actually supported, dummy for compile */ + +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + +int first_free_entry; +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +int pin_map_size; + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* I/O APIC entries */ +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; +int nr_ioapics; + +/* MP IRQ source entries */ +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +int skip_ioapic_setup; + +static int __init parse_noapic(char *str) +{ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); + + struct irq_cfg; struct irq_pin_list; struct irq_cfg { @@ -228,53 +269,6 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) return cfg; } -static int assign_irq_vector(int irq, cpumask_t mask); - -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - -int sis_apic_bug; /* not actually supported, dummy for compile */ - -static int no_timer_check; - -static int disable_timer_pin_1 __initdata; - -int timer_through_8259 __initdata; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC RTE contents at the OS boot up */ -struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ - -int pin_map_size; - /* * This is performance-critical, we want to do it O(1) * @@ -481,12 +475,16 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) apic = entry->apic; pin = entry->pin; +#ifdef CONFIG_INTR_REMAP /* * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. */ if (!irq_remapped(irq)) io_apic_write(apic, 0x11 + pin*2, dest); +#else + io_apic_write(apic, 0x11 + pin*2, dest); +#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; @@ -497,6 +495,8 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) } } +static int assign_irq_vector(int irq, cpumask_t mask); + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { struct irq_cfg *cfg = irq_cfg(irq); @@ -533,7 +533,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -int first_free_entry; static void add_pin_to_irq(unsigned int irq, int apic, int pin) { struct irq_cfg *cfg; @@ -679,6 +678,10 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } +#ifdef CONFIG_INTR_REMAP +/* I/O APIC RTE contents at the OS boot up */ +static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + /* * Saves and masks all the unmasked IO-APIC RTE's */ @@ -741,25 +744,7 @@ void reinit_intr_remapped_IO_APIC(int intr_remapping) */ restore_IO_APIC_setup(); } - -int skip_ioapic_setup; -int ioapic_force; - -static int __init parse_noapic(char *str) -{ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 1; -} -__setup("disable_timer_pin_1", disable_timer_pin_setup); - +#endif /* * Find the IRQ entry number of a certain pin. @@ -1327,8 +1312,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, { struct IO_APIC_route_entry entry; +#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) return; +#endif memset(&entry, 0, sizeof(entry)); @@ -1601,6 +1588,9 @@ __apicdebuginit(int) print_all_ICs(void) fs_initcall(print_all_ICs); +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; @@ -1695,6 +1685,15 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } +static int no_timer_check; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1708,6 +1707,9 @@ static int __init timer_irq_works(void) unsigned long t1 = jiffies; unsigned long flags; + if (no_timer_check) + return 1; + local_save_flags(flags); local_irq_enable(); /* Let ten ticks pass... */ @@ -2239,6 +2241,17 @@ static inline void __init unlock_ExtINT_logic(void) ioapic_write_entry(apic, pin, entry0); } +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2286,8 +2299,10 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { +#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); +#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2305,7 +2320,7 @@ static inline void __init check_timer(void) setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } unmask_IO_APIC_irq(0); - if (!no_timer_check && timer_irq_works()) { + if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); enable_8259A_irq(0); @@ -2314,8 +2329,10 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } +#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); +#endif clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2391,13 +2408,6 @@ out: local_irq_restore(flags); } -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - /* * Traditionally ISA IRQ2 is the cascade IRQ, and is not available * to devices. However there may be an I/O APIC pin available for -- cgit v1.2.3-70-g09d2 From d4057bdb6a3bb85dd44f9f39f41eac53696fd637 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:38 -0700 Subject: x86: make headers files the same in io_apic_xx.c also make no_timer_check to be global on 64 bit, because vmi_32 is using that. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 15 ++++++++++++--- arch/x86/kernel/io_apic_64.c | 12 +++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 3010bdd3352..48184e126f2 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -25,28 +25,37 @@ #include #include #include -#include +#include #include #include #include #include #include -#include #include #include #include #include -#include /* time_after() */ +#include /* time_after() */ +#ifdef CONFIG_ACPI +#include +#endif +#include +#include +#include #include #include #include +#include +#include +#include #include #include #include #include #include #include +#include #include #include diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 847aa798764..3c66e5a8e89 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -27,12 +27,15 @@ #include #include #include +#include #include +#include #include #include #include -#include -#include +#include +#include +#include /* time_after() */ #ifdef CONFIG_ACPI #include #endif @@ -46,14 +49,17 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include +#include #define __apicdebuginit(type) static type __init @@ -1685,7 +1691,7 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } -static int no_timer_check; +int no_timer_check __initdata; static int __init notimercheck(char *s) { -- cgit v1.2.3-70-g09d2 From f876d213a59c363d2492e399cc6c24edd6f3c368 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:39 -0700 Subject: x86: make 64 handle sis_apic_bug like the 32 bit do we have 64bit system with sis chipset? [ mingo@elte.hu: nope, the problem chipset was 32-bit only. The code symmetry is good nevertheless. ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 3c66e5a8e89..915af9b87aa 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -65,7 +65,11 @@ int ioapic_force; -int sis_apic_bug; /* not actually supported, dummy for compile */ +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; static DEFINE_SPINLOCK(ioapic_lock); static DEFINE_SPINLOCK(vector_lock); @@ -373,9 +377,11 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i * Re-write a value: to be used for read-modify-write * cycles where the read already set up the index register. */ -static inline void io_apic_modify(unsigned int apic, unsigned int value) +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); + if (sis_apic_bug) + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -494,7 +500,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; - io_apic_modify(apic, reg); + io_apic_modify(apic, 0x10 + pin*2, reg); if (!entry->next) break; entry = entry->next; @@ -624,7 +630,7 @@ static inline void io_apic_sync(unsigned int apic) pin = entry->pin; \ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ - io_apic_modify(entry->apic, reg); \ + io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ FINAL; \ if (!entry->next) \ break; \ @@ -2450,6 +2456,20 @@ void __init setup_IO_APIC(void) check_timer(); } +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; +} + +late_initcall(io_apic_bug_finalize); + struct sysfs_ioapic_data { struct sys_device dev; struct IO_APIC_route_entry entry[0]; -- cgit v1.2.3-70-g09d2 From aa45f97b1bb40adae1288669e73350907ffae85e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:40 -0700 Subject: x86: remove ioapic_force no user left. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 1 - arch/x86/kernel/io_apic_64.c | 2 -- include/asm-x86/apic.h | 2 -- 3 files changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 4d7a188025b..cd860856a0b 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1813,7 +1813,6 @@ static int __init apic_set_verbosity(char *arg) if (!arg) { #ifdef CONFIG_X86_64 skip_ioapic_setup = 0; - ioapic_force = 1; return 0; #endif return -EINVAL; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 915af9b87aa..b70fd823218 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -63,8 +63,6 @@ #define __apicdebuginit(type) static type __init -int ioapic_force; - /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index d76a0839abe..2d970f6bc2a 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -40,8 +40,6 @@ extern void generic_apic_probe(void); extern unsigned int apic_verbosity; extern int local_apic_timer_c2_ok; -extern int ioapic_force; - extern int disable_apic; /* * Basic functions accessing APICs. -- cgit v1.2.3-70-g09d2 From 047c8fdb8718890e3340073b178d0859d0c7f91f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:41 -0700 Subject: x86: make io_apic_64.c and io_apic_32.c the same all the same except INTR_REMAPPING related and ioapic io resource. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 213 +++++++++++++- arch/x86/kernel/io_apic_64.c | 663 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 829 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 48184e126f2..3ed36041c81 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -123,7 +123,6 @@ struct irq_cfg { u8 move_in_progress : 1; }; - /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ static struct irq_cfg irq_cfg_legacy[] __initdata = { [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, @@ -391,6 +390,38 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } +#ifdef CONFIG_X86_64 +static bool io_apic_level_ack_pending(unsigned int irq) +{ + struct irq_pin_list *entry; + unsigned long flags; + struct irq_cfg *cfg = irq_cfg(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + int pin; + + if (!entry) + break; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + if (!entry->next) + break; + entry = entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} +#endif + union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -483,17 +514,15 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp; - cfg = irq_cfg(irq); - cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) return; + cfg = irq_cfg(irq); if (assign_irq_vector(irq, mask)) return; cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); /* * Only the high 8 bits are valid. @@ -572,6 +601,54 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } +#ifdef CONFIG_X86_64 +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); +} + +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_cfg *cfg; \ + struct irq_pin_list *entry; \ + \ + cfg = irq_cfg(irq); \ + entry = cfg->irq_2_pin; \ + for (;;) { \ + unsigned int reg; \ + if (!entry) \ + break; \ + pin = entry->pin; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ + FINAL; \ + if (!entry->next) \ + break; \ + entry = entry->next; \ + } \ +} + +#define DO_ACTION(name,R,ACTION, FINAL) \ + \ + static void name##_IO_APIC_irq (unsigned int irq) \ + __DO_ACTION(R, ACTION, FINAL) + +/* mask = 1 */ +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) + +/* mask = 0 */ +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) + +#else + static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_cfg *cfg; @@ -620,6 +697,8 @@ static void __unmask_and_level_IO_APIC_irq(unsigned int irq) IO_APIC_REDIR_MASKED); } +#endif + static void mask_IO_APIC_irq(unsigned int irq) { unsigned long flags; @@ -1055,6 +1134,17 @@ void unlock_vector_lock(void) static int __assign_irq_vector(int irq, cpumask_t mask) { + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; unsigned int old_vector; int cpu; @@ -1095,9 +1185,13 @@ next: } if (unlikely(current_vector == vector)) continue; - if (vector == SYSCALL_VECTOR) +#ifdef CONFIG_X86_64 + if (vector == IA32_SYSCALL_VECTOR) goto next; - +#else + if (vector == SYSCALL_VECTOR) + goto next; +#endif for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; @@ -1184,6 +1278,7 @@ static struct irq_chip ioapic_chip; #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 +#ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { int apic, idx, pin; @@ -1200,6 +1295,12 @@ static inline int IO_APIC_irq_trigger(int irq) */ return 0; } +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif static void ioapic_register_intr(int irq, unsigned long trigger) { @@ -1212,15 +1313,18 @@ static void ioapic_register_intr(int irq, unsigned long trigger) desc = irq_to_desc_alloc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { + trigger == IOAPIC_LEVEL) desc->status |= IRQ_LEVEL; + else + desc->status &= ~IRQ_LEVEL; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - } else { - desc->status &= ~IRQ_LEVEL; + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); - } } static int setup_ioapic_entry(int apic, int irq, @@ -1662,7 +1766,6 @@ static void __init enable_IO_APIC(void) struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, pin); - /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. */ @@ -2012,6 +2115,60 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } +#ifdef CONFIG_X86_64 +static void ack_apic_level(unsigned int irq) +{ + int do_unmask_irq = 0; + + irq_complete_move(irq); +#ifdef CONFIG_GENERIC_PENDING_IRQ + /* If we are moving the irq we need to mask it */ + if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } +#endif + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq, desc); + unmask_IO_APIC_irq(irq); + } +} +#else atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { @@ -2053,6 +2210,7 @@ static void ack_apic_level(unsigned int irq) spin_unlock(&ioapic_lock); } } +#endif static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", @@ -2224,7 +2382,7 @@ static inline void __init unlock_ExtINT_logic(void) } static int disable_timer_pin_1 __initdata; - +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ static int __init parse_disable_timer_pin_1(char *arg) { disable_timer_pin_1 = 1; @@ -2244,9 +2402,9 @@ static inline void __init check_timer(void) { struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; - int no_pin1 = 0; - unsigned int ver; unsigned long flags; + unsigned int ver; + int no_pin1 = 0; local_irq_save(flags); @@ -2550,6 +2708,7 @@ unsigned int create_irq_nr(unsigned int irq_want) cfg_new = irq_cfg(new); if (cfg_new && cfg_new->vector != 0) continue; + /* check if need to create one */ if (!cfg_new) cfg_new = irq_cfg_alloc(new); if (__assign_irq_vector(new, TARGET_CPUS) == 0) @@ -2720,6 +2879,32 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) return 0; } +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + unsigned int irq; + int ret, sub_handle; + struct msi_desc *desc; + unsigned int irq_want; + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } + return 0; + +error: + destroy_irq(irq); + return ret; +} + + void arch_teardown_msi_irq(unsigned int irq) { destroy_irq(irq); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index b70fd823218..940c4167b32 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -94,18 +94,22 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int skip_ioapic_setup; static int __init parse_noapic(char *str) { + /* disable IO-APIC */ disable_ioapic_setup(); return 0; } early_param("noapic", parse_noapic); - struct irq_cfg; struct irq_pin_list; struct irq_cfg { @@ -374,6 +378,8 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i /* * Re-write a value: to be used for read-modify-write * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register */ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { @@ -383,6 +389,7 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } +#ifdef CONFIG_X86_64 static bool io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; @@ -412,6 +419,7 @@ static bool io_apic_level_ack_pending(unsigned int irq) return false; } +#endif union entry_union { struct { u32 w1, w2; }; @@ -509,7 +517,7 @@ static int assign_irq_vector(int irq, cpumask_t mask); static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_cfg *cfg; unsigned long flags; unsigned int dest; cpumask_t tmp; @@ -519,12 +527,12 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) if (cpus_empty(tmp)) return; + cfg = irq_cfg(irq); if (assign_irq_vector(irq, mask)) return; cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); - /* * Only the high 8 bits are valid. */ @@ -536,7 +544,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) desc->affinity = mask; spin_unlock_irqrestore(&ioapic_lock, flags); } -#endif +#endif /* CONFIG_SMP */ /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are @@ -602,6 +610,7 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } +#ifdef CONFIG_X86_64 /* * Synchronize the IO-APIC and the CPU by doing * a dummy read from the IO-APIC @@ -647,6 +656,58 @@ DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) /* mask = 0 */ DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) +#else + +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) +{ + struct irq_cfg *cfg; + struct irq_pin_list *entry; + unsigned int pin, reg; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + for (;;) { + if (!entry) + break; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg &= ~disable; + reg |= enable; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +/* mask = 1 */ +static void __mask_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER); +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED); +} + +#endif + static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -688,6 +749,64 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } +#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) +void send_IPI_self(int vector) +{ + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write(APIC_ICR, cfg); +} +#endif /* !CONFIG_SMP && CONFIG_X86_32*/ + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + #ifdef CONFIG_INTR_REMAP /* I/O APIC RTE contents at the OS boot up */ static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; @@ -861,18 +980,51 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return best_guess; } +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ #define default_ISA_trigger(idx) (0) #define default_ISA_polarity(idx) (0) +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + /* PCI interrupts are always polarity one level triggered, * when listed as conforming in the MP table. */ #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) + static int MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mp_srcbus; @@ -930,6 +1082,36 @@ static int MPBIOS_trigger(int idx) trigger = default_ISA_trigger(idx); else trigger = default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif break; case 1: /* edge */ { @@ -967,6 +1149,7 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } +int (*ioapic_renumber_irq)(int ioapic, int irq); static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -988,7 +1171,32 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + return irq; } @@ -1058,8 +1266,13 @@ next: } if (unlikely(current_vector == vector)) continue; +#ifdef CONFIG_X86_64 if (vector == IA32_SYSCALL_VECTOR) goto next; +#else + if (vector == SYSCALL_VECTOR) + goto next; +#endif for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; @@ -1140,6 +1353,34 @@ static struct irq_chip ioapic_chip; static struct irq_chip ir_ioapic_chip; #endif +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif + static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; @@ -1150,7 +1391,8 @@ static void ioapic_register_intr(int irq, unsigned long trigger) else desc = irq_to_desc_alloc(irq); - if (trigger) + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) desc->status |= IRQ_LEVEL; else desc->status &= ~IRQ_LEVEL; @@ -1168,7 +1410,8 @@ static void ioapic_register_intr(int irq, unsigned long trigger) return; } #endif - if (trigger) + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); @@ -1303,6 +1546,10 @@ static void __init setup_IO_APIC_irqs(void) } irq = pin_2_irq(idx, apic, pin); +#ifdef CONFIG_X86_32 + if (multi_timer_check(apic, irq)) + continue; +#endif add_pin_to_irq(irq, apic, pin); setup_IO_APIC_irq(apic, pin, irq, @@ -1360,6 +1607,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_00 reg_00; union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; @@ -1384,6 +1632,8 @@ __apicdebuginit(void) print_IO_APIC(void) reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); @@ -1399,11 +1649,27 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if (reg_01.bits.version >= 0x10) { + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); } + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" @@ -1475,7 +1741,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) __apicdebuginit(void) print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; - unsigned long icr; + u64 icr; if (apic_verbosity == APIC_QUIET) return; @@ -1492,11 +1758,13 @@ __apicdebuginit(void) print_local_APIC(void *dummy) v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } v = apic_read(APIC_EOI); printk(KERN_DEBUG "... APIC EOI: %08x\n", v); @@ -1516,8 +1784,13 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC IRR field:\n"); print_APIC_bitfield(APIC_IRR); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } icr = apic_icr_read(); printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); @@ -1608,6 +1881,13 @@ void __init enable_IO_APIC(void) int apic; unsigned long flags; +#ifdef CONFIG_X86_32 + int i; + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; +#endif + /* * The number of IO-APIC IRQ registers (== #pins): */ @@ -1636,6 +1916,10 @@ void __init enable_IO_APIC(void) } found_i8259: /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); i8259_apic = find_isa_irq_apic(0, mp_ExtINT); /* Trust the MP table if nothing is setup in the hardware */ @@ -1695,6 +1979,122 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ + +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + return; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mp_apicid; + + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mp_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mp_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mp_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mp_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mp_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mp_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mp_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_dstapic == old_id) + mp_irqs[i].mp_dstapic + = mp_ioapics[apic].mp_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mp_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mp_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#endif + int no_timer_check __initdata; static int __init notimercheck(char *s) @@ -1780,8 +2180,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } +#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; @@ -1791,6 +2193,14 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } +#else +static int ioapic_retrigger_irq(unsigned int irq) +{ + send_IPI_self(irq_cfg(irq)->vector); + + return 1; +} +#endif /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -1952,7 +2362,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; ack_APIC_irq(); +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); me = smp_processor_id(); @@ -2024,6 +2436,7 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } +#ifdef CONFIG_X86_64 static void ack_apic_level(unsigned int irq) { int do_unmask_irq = 0; @@ -2076,6 +2489,49 @@ static void ack_apic_level(unsigned int irq) unmask_IO_APIC_irq(irq); } } +#else +atomic_t irq_mis_count; +static void ack_apic_level(unsigned int irq) +{ + unsigned long v; + int i; + + irq_complete_move(irq); + move_native_irq(irq); + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} +#endif static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", @@ -2141,20 +2597,24 @@ static inline void init_IO_APIC_traps(void) } } -static void unmask_lapic_irq(unsigned int irq) +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(unsigned int irq) { unsigned long v; v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } -static void mask_lapic_irq(unsigned int irq) +static void unmask_lapic_irq(unsigned int irq) { unsigned long v; v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } static void ack_lapic_irq (unsigned int irq) @@ -2182,19 +2642,19 @@ static void lapic_register_intr(int irq) static void __init setup_nmi(void) { /* - * Dirty trick to enable the NMI watchdog ... + * Dirty trick to enable the NMI watchdog ... * We put the 8259A master into AEOI mode and * unmask on all local APICs LVT0 as NMI. * * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') * is from Maciej W. Rozycki - so we do not have to EOI from * the NMI handler or the timer interrupt. - */ - printk(KERN_INFO "activating NMI Watchdog ..."); + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); enable_NMI_through_LVT0(); - printk(" done.\n"); + apic_printk(APIC_VERBOSE, " done.\n"); } /* @@ -2211,12 +2671,17 @@ static inline void __init unlock_ExtINT_logic(void) unsigned char save_control, save_freq_select; pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } apic = find_isa_irq_apic(8, mp_INT); - if (pin == -1) + if (apic == -1) { + WARN_ON_ONCE(1); return; + } entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -2268,17 +2733,21 @@ int timer_through_8259 __initdata; * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. * - * FIXME: really need to revamp this for modern platforms only. + * FIXME: really need to revamp this for all platforms. */ static inline void __init check_timer(void) { struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; unsigned long flags; + unsigned int ver; int no_pin1 = 0; local_irq_save(flags); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + /* * get/set the timer IRQ vector: */ @@ -2287,10 +2756,18 @@ static inline void __init check_timer(void) /* * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); +#ifdef CONFIG_X86_32 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2382,6 +2859,9 @@ static inline void __init check_timer(void) "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } +#ifdef CONFIG_X86_32 + timer_ack = 0; +#endif apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@ -2435,19 +2915,29 @@ out: * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1<<2) +#define PIC_IRQS (1 << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { +#ifdef CONFIG_X86_32 + enable_IO_APIC(); +#else /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ +#endif io_apic_irqs = ~PIC_IRQS; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - + /* + * Set up IO-APIC IRQ routing. + */ +#ifdef CONFIG_X86_32 + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#endif sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); @@ -3128,7 +3618,93 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) #ifdef CONFIG_ACPI -#define IO_APIC_MAX_ID 0xFE +#ifdef CONFIG_X86_32 +int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + +int __init io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} +#endif int __init io_apic_get_redir_entries (int ioapic) { @@ -3226,6 +3802,7 @@ void __init setup_ioapic_dest(void) } #endif +#ifdef CONFIG_X86_64 #define IOAPIC_RESOURCE_NAME_SIZE 11 static struct resource *ioapic_resources; @@ -3261,36 +3838,56 @@ static struct resource * __init ioapic_setup_resources(void) return res; } +#endif void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - struct resource *ioapic_res; int i; +#ifdef CONFIG_X86_64 + struct resource *ioapic_res; ioapic_res = ioapic_setup_resources(); +#endif for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %016lx (%016lx)\n", + "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx), ioapic_phys); idx++; +#ifdef CONFIG_X86_64 if (ioapic_res != NULL) { ioapic_res->start = ioapic_phys; ioapic_res->end = ioapic_phys + (4 * 1024) - 1; ioapic_res++; } +#endif } } +#ifdef CONFIG_X86_64 static int __init ioapic_insert_resources(void) { int i; @@ -3313,4 +3910,4 @@ static int __init ioapic_insert_resources(void) /* Insert the IO APIC resources after PCI initialization has occured to handle * IO APICS that are mapped in on a BAR in PCI space. */ late_initcall(ioapic_insert_resources); - +#endif -- cgit v1.2.3-70-g09d2 From 54168ed7f2a4f3fc2780e645124ae952598da601 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Aug 2008 09:07:45 +0200 Subject: x86: make io_apic_32.c the same as io_apic_64.c Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 1521 ++++++++++++++++++++++++++++++------------ 1 file changed, 1104 insertions(+), 417 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 3ed36041c81..fba6d6ee348 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -35,7 +35,7 @@ #include #include #include -#include /* time_after() */ +#include /* time_after() */ #ifdef CONFIG_ACPI #include #endif @@ -64,8 +64,8 @@ #define __apicdebuginit(type) static type __init /* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes */ int sis_apic_bug = -1; @@ -102,7 +102,7 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int skip_ioapic_setup; -static int __init parse_noapic(char *arg) +static int __init parse_noapic(char *str) { /* disable IO-APIC */ disable_ioapic_setup(); @@ -188,7 +188,7 @@ static void __init init_work(void *data) irq_cfgx[legacy_count - 1].next = NULL; } -#define for_each_irq_cfg(cfg) \ +#define for_each_irq_cfg(cfg) \ for (cfg = irq_cfgx; cfg; cfg = cfg->next) DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); @@ -262,7 +262,6 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) irq_cfgx = cfg; cfg->irq = irq; printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); - #ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG { /* dump the results */ @@ -384,9 +383,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i */ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { - volatile struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); + struct io_apic __iomem *io_apic = io_apic_base(apic); + if (sis_apic_bug) + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -494,11 +493,20 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) apic = entry->apic; pin = entry->pin; +#ifdef CONFIG_INTR_REMAP + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); +#else io_apic_write(apic, 0x11 + pin*2, dest); +#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; - io_apic_modify(apic, 0x10 + pin *2, reg); + io_apic_modify(apic, 0x10 + pin*2, reg); if (!entry->next) break; entry = entry->next; @@ -513,6 +521,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) unsigned long flags; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -529,12 +538,12 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) */ dest = SET_APIC_LOGICAL_ID(dest); + desc = irq_to_desc(irq); spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); - irq_to_desc(irq)->affinity = mask; + desc->affinity = mask; spin_unlock_irqrestore(&ioapic_lock, flags); } - #endif /* CONFIG_SMP */ /* @@ -699,7 +708,7 @@ static void __unmask_and_level_IO_APIC_irq(unsigned int irq) #endif -static void mask_IO_APIC_irq(unsigned int irq) +static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -708,7 +717,7 @@ static void mask_IO_APIC_irq(unsigned int irq) spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_IO_APIC_irq(unsigned int irq) +static void unmask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -725,14 +734,13 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; - /* * Disable it in the IO-APIC irq-routing table: */ ioapic_mask_entry(apic, pin); } -static void clear_IO_APIC(void) +static void clear_IO_APIC (void) { int apic, pin; @@ -741,7 +749,7 @@ static void clear_IO_APIC(void) clear_IO_APIC_pin(apic, pin); } -#ifndef CONFIG_SMP +#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) void send_IPI_self(int vector) { unsigned int cfg; @@ -756,9 +764,9 @@ void send_IPI_self(int vector) */ apic_write(APIC_ICR, cfg); } -#endif /* !CONFIG_SMP */ - +#endif /* !CONFIG_SMP && CONFIG_X86_32*/ +#ifdef CONFIG_X86_32 /* * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to * specific CPU-side IRQs. @@ -797,6 +805,75 @@ static int __init ioapic_pirq_setup(char *str) } __setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_INTR_REMAP +/* I/O APIC RTE contents at the OS boot up */ +static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + +/* + * Saves and masks all the unmasked IO-APIC RTE's + */ +int save_mask_IO_APIC_setup(void) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + int apic, pin; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + + for (apic = 0; apic < nr_ioapics; apic++) { + early_ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_KERNEL); + if (!early_ioapic_entries[apic]) + return -ENOMEM; + } + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin] = + ioapic_read_entry(apic, pin); + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + return 0; +} + +void restore_IO_APIC_setup(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + ioapic_write_entry(apic, pin, + early_ioapic_entries[apic][pin]); +} + +void reinit_intr_remapped_IO_APIC(int intr_remapping) +{ + /* + * for now plain restore of previous settings. + * TBD: In the case of OS enabling interrupt-remapping, + * IO-APIC RTE's need to be setup to point to interrupt-remapping + * table entries. for now, do a plain restore, and wait for + * the setup_IO_APIC_irqs() to do proper initialization. + */ + restore_IO_APIC_setup(); +} +#endif /* * Find the IRQ entry number of a certain pin. @@ -848,7 +925,7 @@ static int __init find_isa_irq_apic(int irq, int type) } if (i < mp_irq_entries) { int apic; - for (apic = 0; apic < nr_ioapics; apic++) { + for(apic = 0; apic < nr_ioapics; apic++) { if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) return apic; } @@ -867,10 +944,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) { int apic, i, best_guess = -1; - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " - "slot:%d, pin:%d.\n", bus, slot, pin); + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); if (test_bit(bus, mp_bus_not_pci)) { - printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } for (i = 0; i < mp_irq_entries; i++) { @@ -885,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) !mp_irqs[i].mp_irqtype && (bus == lbus) && (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq); + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); if (!(apic || IO_APIC_IRQ(irq))) continue; @@ -902,6 +979,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) } return best_guess; } + EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) @@ -918,6 +996,7 @@ static int EISA_ELCR(unsigned int irq) "Broken MPtable reports ISA irq %d\n", irq); return 0; } + #endif /* ISA interrupts are always polarity zero edge triggered, @@ -954,36 +1033,36 @@ static int MPBIOS_polarity(int idx) /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].mp_irqflag & 3) { - case 0: /* conforms, ie. bus-type dependent polarity */ - { - polarity = test_bit(bus, mp_bus_not_pci)? - default_ISA_polarity(idx): - default_PCI_polarity(idx); - break; - } - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ + switch (mp_irqs[idx].mp_irqflag & 3) { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } + case 0: /* conforms, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); + break; + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } } return polarity; } @@ -996,67 +1075,67 @@ static int MPBIOS_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { - case 0: /* conforms, ie. bus-type dependent */ + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { - trigger = test_bit(bus, mp_bus_not_pci)? - default_ISA_trigger(idx): - default_PCI_trigger(idx); + case 0: /* conforms, ie. bus-type dependent */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif break; - } - case MP_BUS_EISA: /* EISA pin */ + case 1: /* edge */ { - trigger = default_EISA_trigger(idx); + trigger = 0; break; } - case MP_BUS_PCI: /* PCI pin */ + case 2: /* reserved */ { - /* set before the switch */ + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; break; } - case MP_BUS_MCA: /* MCA pin */ + case 3: /* level */ { - trigger = default_MCA_trigger(idx); + trigger = 1; break; } - default: + default: /* invalid */ { printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; + trigger = 0; break; } } -#endif - break; - } - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } return trigger; } @@ -1082,9 +1161,9 @@ static int pin_2_irq(int idx, int apic, int pin) if (mp_irqs[idx].mp_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - if (test_bit(bus, mp_bus_not_pci)) + if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].mp_srcbusirq; - else { + } else { /* * PCI IRQs are mapped in order */ @@ -1092,14 +1171,14 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; - - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } +#ifdef CONFIG_X86_32 /* * PCI IRQ command line redirection. Yes, limits are hardcoded. */ @@ -1116,6 +1195,8 @@ static int pin_2_irq(int idx, int apic, int pin) } } } +#endif + return irq; } @@ -1145,74 +1226,70 @@ static int __assign_irq_vector(int irq, cpumask_t mask) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; - unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + unsigned int old_vector; + int cpu; + struct irq_cfg *cfg; - cfg = irq_cfg(irq); + cfg = irq_cfg(irq); - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); + /* Only try and allocate irqs on cpus that are present */ + cpus_and(mask, mask, cpu_online_map); - if ((cfg->move_in_progress) || cfg->move_cleanup_count) - return -EBUSY; + if ((cfg->move_in_progress) || cfg->move_cleanup_count) + return -EBUSY; - old_vector = cfg->vector; - if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) - return 0; - } + old_vector = cfg->vector; + if (old_vector) { + cpumask_t tmp; + cpus_and(tmp, cfg->domain, mask); + if (!cpus_empty(tmp)) + return 0; + } - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; - int new_cpu; - int vector, offset; + for_each_cpu_mask_nr(cpu, mask) { + cpumask_t domain, new_mask; + int new_cpu; + int vector, offset; - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); + domain = vector_allocation_domain(cpu); + cpus_and(new_mask, domain, cpu_online_map); - vector = current_vector; - offset = current_offset; + vector = current_vector; + offset = current_offset; next: - vector += 8; - if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (unlikely(current_vector == vector)) - continue; + vector += 8; + if (vector >= first_system_vector) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; #ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; + if (vector == IA32_SYSCALL_VECTOR) + goto next; #else - if (vector == SYSCALL_VECTOR) - goto next; + if (vector == SYSCALL_VECTOR) + goto next; #endif - for_each_cpu_mask_nr(new_cpu, new_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) - goto next; - /* Found one! */ - current_vector = vector; - current_offset = offset; - if (old_vector) { - cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; - } - printk(KERN_DEBUG "assign_irq_vector: irq %d vector %#x cpu ", irq, vector); - for_each_cpu_mask_nr(new_cpu, new_mask) { - per_cpu(vector_irq, new_cpu)[vector] = irq; - printk(KERN_CONT " %d ", new_cpu); + for_each_cpu_mask_nr(new_cpu, new_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cfg->old_domain = cfg->domain; } - printk(KERN_CONT "\n"); - cfg->vector = vector; - cfg->domain = domain; - return 0; - } - return -ENOSPC; + for_each_cpu_mask_nr(new_cpu, new_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cfg->domain = domain; + return 0; + } + return -ENOSPC; } static int assign_irq_vector(int irq, cpumask_t mask) @@ -1223,7 +1300,6 @@ static int assign_irq_vector(int irq, cpumask_t mask) spin_lock_irqsave(&vector_lock, flags); err = __assign_irq_vector(irq, mask); spin_unlock_irqrestore(&vector_lock, flags); - return err; } @@ -1269,36 +1345,39 @@ void __setup_vector_irq(int cpu) cfg = irq_cfg(irq); if (!cpu_isset(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; - } + } } static struct irq_chip ioapic_chip; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip; +#endif -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { - int apic, idx, pin; + int apic, idx, pin; - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; } #else static inline int IO_APIC_irq_trigger(int irq) { - return 1; + return 1; } #endif @@ -1318,13 +1397,27 @@ static void ioapic_register_intr(int irq, unsigned long trigger) else desc->status &= ~IRQ_LEVEL; +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + desc->status |= IRQ_MOVE_PCNTXT; + if (trigger) + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_edge_irq, "edge"); + return; + } +#endif if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); + handle_fasteoi_irq, + "fasteoi"); else set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + handle_edge_irq, "edge"); } static int setup_ioapic_entry(int apic, int irq, @@ -1337,11 +1430,45 @@ static int setup_ioapic_entry(int apic, int irq, */ memset(entry,0,sizeof(*entry)); - entry->delivery_mode = INT_DELIVERY_MODE; - entry->dest_mode = INT_DEST_MODE; - entry->dest = destination; +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_ioapic_to_ir(apic); + struct irte irte; + struct IR_IO_APIC_route_entry *ir_entry = + (struct IR_IO_APIC_route_entry *) entry; + int index; + + if (!iommu) + panic("No mapping iommu for ioapic %d\n", apic); + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + panic("Failed to allocate IRTE for ioapic %d\n", apic); + + memset(&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = trigger; + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = vector; + irte.dest_id = IRTE_DEST(destination); + + modify_irte(irq, &irte); + + ir_entry->index2 = (index >> 15) & 0x1; + ir_entry->zero = 0; + ir_entry->format = 1; + ir_entry->index = (index & 0x7fff); + } else +#endif + { + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest = destination; + } - entry->mask = 0; /* enable IRQ */ + entry->mask = 0; /* enable IRQ */ entry->trigger = trigger; entry->polarity = polarity; entry->vector = vector; @@ -1351,12 +1478,11 @@ static int setup_ioapic_entry(int apic, int irq, */ if (trigger) entry->mask = 1; - return 0; } static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, - int trigger, int polarity) + int trigger, int polarity) { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; @@ -1420,10 +1546,10 @@ static void __init setup_IO_APIC_irqs(void) } irq = pin_2_irq(idx, apic, pin); - +#ifdef CONFIG_X86_32 if (multi_timer_check(apic, irq)) continue; - +#endif add_pin_to_irq(irq, apic, pin); setup_IO_APIC_irq(apic, pin, irq, @@ -1443,6 +1569,11 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, { struct IO_APIC_route_entry entry; +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + return; +#endif + memset(&entry, 0, sizeof(entry)); /* @@ -1461,7 +1592,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, * The timer IRQ doesn't have to know that behind the * scene we may have a 8259A-master in AEOI mode ... */ - ioapic_register_intr(0, IOAPIC_EDGE); + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); /* * Add it to the IO-APIC irq-routing table: @@ -1501,17 +1632,18 @@ __apicdebuginit(void) print_IO_APIC(void) reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); + printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); @@ -1548,7 +1680,10 @@ __apicdebuginit(void) print_IO_APIC(void) entry = ioapic_read_entry(apic, i); - printk(KERN_DEBUG " %02x %02X ", i, entry.dest); + printk(KERN_DEBUG " %02x %03X ", + i, + entry.dest + ); printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", entry.mask, @@ -1567,7 +1702,7 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_pin_list *entry = cfg->irq_2_pin; if (!entry) continue; - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", cfg->irq); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1614,8 +1749,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, - GET_APIC_ID(v)); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); @@ -1624,7 +1758,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (APIC_INTEGRATED(ver)) { /* !82489DX */ v = apic_read(APIC_ARBPRI); printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, v & APIC_ARBPRI_MASK); @@ -1650,9 +1784,10 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC IRR field:\n"); print_APIC_bitfield(APIC_IRR); - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); printk(KERN_DEBUG "... APIC ESR: %08x\n", v); } @@ -1710,11 +1845,11 @@ __apicdebuginit(void) print_PIC(void) v = inb(0xa0) << 8 | inb(0x20); printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - outb(0x0b, 0xa0); - outb(0x0b, 0x20); + outb(0x0b,0xa0); + outb(0x0b,0x20); v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a, 0xa0); - outb(0x0a, 0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); spin_unlock_irqrestore(&i8259A_lock, flags); @@ -1739,16 +1874,19 @@ fs_initcall(print_all_ICs); /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static void __init enable_IO_APIC(void) +void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; - int i, apic; + int apic; unsigned long flags; +#ifdef CONFIG_X86_32 + int i; if (!pirqs_enabled) for (i = 0; i < MAX_PIRQS; i++) pirq_entries[i] = -1; +#endif /* * The number of IO-APIC IRQ registers (== #pins): @@ -1759,7 +1897,7 @@ static void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } - for (apic = 0; apic < nr_ioapics; apic++) { + for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { @@ -1830,16 +1968,18 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest = read_apic_id(); + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: */ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } + disconnect_bsp_APIC(ioapic_i8259.pin != -1); } +#ifdef CONFIG_X86_32 /* * function to set the IO-APIC physical IDs based on the * values stored in the MPC table. @@ -1940,8 +2080,6 @@ static void __init setup_ioapic_ids_from_mpc(void) reg_00.bits.ID = mp_ioapics[apic].mp_apicid; spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check @@ -1955,6 +2093,7 @@ static void __init setup_ioapic_ids_from_mpc(void) apic_printk(APIC_VERBOSE, " ok.\n"); } } +#endif int no_timer_check __initdata; @@ -1994,9 +2133,10 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ + + /* jiffies wrap? */ if (time_after(jiffies, t1 + 4)) return 1; - return 0; } @@ -2014,8 +2154,6 @@ static int __init timer_irq_works(void) */ /* - * Startup quirk: - * * Starting up a edge-triggered IO-APIC interrupt is * nasty - we need to make sure that we get the edge. * If it is already asserted for some reason, we need @@ -2023,9 +2161,8 @@ static int __init timer_irq_works(void) * * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... - * - * (We do this for level-triggered IRQs too - it cannot hurt.) */ + static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; @@ -2043,70 +2180,254 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } +#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { - send_IPI_self(irq_cfg(irq)->vector); + + struct irq_cfg *cfg = irq_cfg(irq); + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + spin_unlock_irqrestore(&vector_lock, flags); return 1; } - -#ifdef CONFIG_SMP -asmlinkage void smp_irq_move_cleanup_interrupt(void) +#else +static int ioapic_retrigger_irq(unsigned int irq) { - unsigned vector, me; - ack_APIC_irq(); - irq_enter(); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; + send_IPI_self(irq_cfg(irq)->vector); - desc = irq_to_desc(irq); - if (!desc) - continue; + return 1; +} +#endif - cfg = irq_cfg(irq); - spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) - goto unlock; +#ifdef CONFIG_SMP - __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; -unlock: - spin_unlock(&desc->lock); - } +#ifdef CONFIG_INTR_REMAP +static void ir_irq_migration(struct work_struct *work); - irq_exit(); -} +static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); -static void irq_complete_move(unsigned int irq) +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For edge triggered, irq migration is a simple atomic update(of vector + * and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we need to modify the io-apic RTE aswell with the update + * vector information, along with modifying IRTE with vector and destination. + * So irq migration for level triggered is little bit more complex compared to + * edge triggered migration. But the good news is, we use the same algorithm + * for level triggered migration as we have today, only difference being, + * we now initiate the irq migration from process context instead of the + * interrupt context. + * + * In future, when we do a directed EOI (combined with cpu EOI broadcast + * suppression) to the IO-APIC, level triggered irq migration will also be + * as simple as edge triggered migration and we can do the irq migration + * with a simple atomic update to IO-APIC RTE. + */ +static void migrate_ioapic_irq(int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg(irq); - unsigned vector, me; + struct irq_cfg *cfg; + struct irq_desc *desc; + cpumask_t tmp, cleanup_mask; + struct irte irte; + int modify_ioapic_rte; + unsigned int dest; + unsigned long flags; - if (likely(!cfg->move_in_progress)) + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) return; - vector = ~get_irq_regs()->orig_ax; - me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; + if (get_irte(irq, &irte)) + return; - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; + if (modify_ioapic_rte) { + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Modified the IRTE and flushes the Interrupt entry cache. + */ + modify_irte(irq, &irte); + + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc->affinity = mask; +} + +static int migrate_irq_remapped_level(int irq) +{ + int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq(irq); + + if (io_apic_level_ack_pending(irq)) { + /* + * Interrupt in progress. Migrating irq now will change the + * vector information in the IO-APIC RTE and that will confuse + * the EOI broadcast performed by cpu. + * So, delay the irq migration to the next instance. + */ + schedule_delayed_work(&ir_migration_work, 1); + goto unmask; + } + + /* everthing is clear. we have right of way */ + migrate_ioapic_irq(irq, desc->pending_mask); + + ret = 0; + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); + +unmask: + unmask_IO_APIC_irq(irq); + return ret; +} + +static void ir_irq_migration(struct work_struct *work) +{ + unsigned int irq; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + if (desc->status & IRQ_MOVE_PENDING) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->chip->set_affinity || + !(desc->status & IRQ_MOVE_PENDING)) { + desc->status &= ~IRQ_MOVE_PENDING; + spin_unlock_irqrestore(&desc->lock, flags); + continue; + } + + desc->chip->set_affinity(irq, desc->pending_mask); + spin_unlock_irqrestore(&desc->lock, flags); + } + } +} + +/* + * Migrates the IRQ destination in the process context. + */ +static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; + migrate_irq_remapped_level(irq); + return; + } + + migrate_ioapic_irq(irq, mask); +} +#endif + +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + ack_APIC_irq(); +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + goto unlock; + + __get_cpu_var(vector_irq)[vector] = -1; + cfg->move_cleanup_count--; +unlock: + spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void irq_complete_move(unsigned int irq) +{ + struct irq_cfg *cfg = irq_cfg(irq); + unsigned vector, me; + + if (likely(!cfg->move_in_progress)) + return; + + vector = ~get_irq_regs()->orig_ax; + me = smp_processor_id(); + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { + cpumask_t cleanup_mask; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); cfg->move_in_progress = 0; } } #else static inline void irq_complete_move(unsigned int irq) {} #endif +#ifdef CONFIG_INTR_REMAP +static void ack_x2apic_level(unsigned int irq) +{ + ack_x2APIC_irq(); +} + +static void ack_x2apic_edge(unsigned int irq) +{ + ack_x2APIC_irq(); +} +#endif static void ack_apic_edge(unsigned int irq) { @@ -2118,55 +2439,55 @@ static void ack_apic_edge(unsigned int irq) #ifdef CONFIG_X86_64 static void ack_apic_level(unsigned int irq) { - int do_unmask_irq = 0; + int do_unmask_irq = 0; - irq_complete_move(irq); + irq_complete_move(irq); #ifdef CONFIG_GENERIC_PENDING_IRQ - /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { - do_unmask_irq = 1; - mask_IO_APIC_irq(irq); - } + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } #endif - /* - * We must acknowledge the irq before we move it or the acknowledge will - * not propagate properly. - */ - ack_APIC_irq(); - - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(irq)) - move_masked_irq(irq, desc); - unmask_IO_APIC_irq(irq); - } + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq); + unmask_IO_APIC_irq(irq); + } } #else atomic_t irq_mis_count; @@ -2177,25 +2498,25 @@ static void ack_apic_level(unsigned int irq) irq_complete_move(irq); move_native_irq(irq); -/* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ i = irq_cfg(irq)->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -2225,6 +2546,20 @@ static struct irq_chip ioapic_chip __read_mostly = { .retrigger = ioapic_retrigger_irq, }; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip __read_mostly = { + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ir_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; +#endif static inline void init_IO_APIC_traps(void) { @@ -2282,7 +2617,7 @@ static void unmask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq(unsigned int irq) +static void ack_lapic_irq (unsigned int irq) { ack_APIC_irq(); } @@ -2383,12 +2718,12 @@ static inline void __init unlock_ExtINT_logic(void) static int disable_timer_pin_1 __initdata; /* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init parse_disable_timer_pin_1(char *arg) +static int __init disable_timer_pin_setup(char *arg) { disable_timer_pin_1 = 1; return 0; } -early_param("disable_timer_pin_1", parse_disable_timer_pin_1); +early_param("disable_timer_pin_1", disable_timer_pin_setup); int timer_through_8259 __initdata; @@ -2397,6 +2732,8 @@ int timer_through_8259 __initdata; * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. */ static inline void __init check_timer(void) { @@ -2408,8 +2745,8 @@ static inline void __init check_timer(void) local_irq_save(flags); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); /* * get/set the timer IRQ vector: @@ -2428,7 +2765,9 @@ static inline void __init check_timer(void) */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); +#ifdef CONFIG_X86_32 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2447,6 +2786,10 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); +#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2473,6 +2816,10 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); +#endif clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2512,7 +2859,9 @@ static inline void __init check_timer(void) "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } +#ifdef CONFIG_X86_32 timer_ack = 0; +#endif apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@ -2570,17 +2919,25 @@ out: void __init setup_IO_APIC(void) { + +#ifdef CONFIG_X86_32 enable_IO_APIC(); +#else + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ +#endif io_apic_irqs = ~PIC_IRQS; - printk("ENABLING IO-APIC IRQs\n"); - - /* - * Set up IO-APIC IRQ routing. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + /* + * Set up IO-APIC IRQ routing. + */ +#ifdef CONFIG_X86_32 + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#endif sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); @@ -2588,15 +2945,15 @@ void __init setup_IO_APIC(void) } /* - * Called after all the initialization is done. If we didnt find any - * APIC bugs then we can allow the modify fast path + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path */ static int __init io_apic_bug_finalize(void) { - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; } late_initcall(io_apic_bug_finalize); @@ -2605,7 +2962,7 @@ struct sysfs_ioapic_data { struct sys_device dev; struct IO_APIC_route_entry entry[0]; }; -static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS]; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { @@ -2615,8 +2972,8 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state) data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - entry[i] = ioapic_read_entry(dev->id, i); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); return 0; } @@ -2653,14 +3010,14 @@ static struct sysdev_class ioapic_sysdev_class = { static int __init ioapic_init_sysfs(void) { - struct sys_device *dev; - int i, size, error = 0; + struct sys_device * dev; + int i, size, error; error = sysdev_class_register(&ioapic_sysdev_class); if (error) return error; - for (i = 0; i < nr_ioapics; i++) { + for (i = 0; i < nr_ioapics; i++ ) { size = sizeof(struct sys_device) + nr_ioapic_registers[i] * sizeof(struct IO_APIC_route_entry); mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); @@ -2691,18 +3048,18 @@ device_initcall(ioapic_init_sysfs); unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - unsigned int irq, new; + unsigned int irq; + unsigned int new; unsigned long flags; struct irq_cfg *cfg_new; #ifndef CONFIG_HAVE_SPARSE_IRQ - /* only can use bus/dev/fn.. when per_cpu vector is used */ irq_want = nr_irqs - 1; #endif irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (nr_irqs - 1); new > 0; new--) { + for (new = irq_want; new > 0; new--) { if (platform_legacy_irq(new)) continue; cfg_new = irq_cfg(new); @@ -2725,7 +3082,14 @@ unsigned int create_irq_nr(unsigned int irq_want) int create_irq(void) { - return create_irq_nr(nr_irqs - 1); + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; } void destroy_irq(unsigned int irq) @@ -2734,6 +3098,9 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); +#ifdef CONFIG_INTR_REMAP + free_irte(irq); +#endif spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); @@ -2759,25 +3126,54 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + memset (&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = 0; /* edge */ + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + modify_irte(irq, &irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else +#endif + { + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + } return err; } @@ -2788,6 +3184,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2808,8 +3205,61 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_to_desc(irq)->affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } + +#ifdef CONFIG_INTR_REMAP +/* + * Migrate the MSI irq to another cpumask. This migration is + * done in the process context using interrupt-remapping hardware. + */ +static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned int dest; + cpumask_t tmp, cleanup_mask; + struct irte irte; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * atomically update the IRTE with the new destination and vector. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif #endif /* CONFIG_SMP */ /* @@ -2827,6 +3277,45 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip msi_ir_chip = { + .name = "IR-PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_x2apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} +#endif static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) { @@ -2840,7 +3329,17 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) set_irq_msi(irq, desc); write_msi_msg(irq, &msg); - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irq_desc *desc = irq_to_desc(irq); + /* + * irq migration in process context + */ + desc->status |= IRQ_MOVE_PCNTXT; + set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); + } else +#endif + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); return 0; } @@ -2859,59 +3358,164 @@ static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { - int irq, ret; - + unsigned int irq; + int ret; unsigned int irq_want; irq_want = build_irq_for_pci_dev(dev) + 0x100; irq = create_irq_nr(irq_want); - if (irq == 0) return -1; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + ret = msi_alloc_irte(dev, irq, 1); + if (ret < 0) + goto error; +no_ir: +#endif ret = setup_msi_irq(dev, desc, irq); if (ret < 0) { destroy_irq(irq); return ret; - } - + } return 0; + +#ifdef CONFIG_INTR_REMAP +error: + destroy_irq(irq); + return ret; +#endif } int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - unsigned int irq; - int ret, sub_handle; - struct msi_desc *desc; - unsigned int irq_want; - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); - if (irq == 0) - return -1; - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) - goto error; - sub_handle++; - } - return 0; + unsigned int irq; + int ret, sub_handle; + struct msi_desc *desc; + unsigned int irq_want; + +#ifdef CONFIG_INTR_REMAP + struct intel_iommu *iommu = 0; + int index = 0; +#endif + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } + return 0; error: - destroy_irq(irq); - return ret; + destroy_irq(irq); + return ret; } - void arch_teardown_msi_irq(unsigned int irq) { destroy_irq(irq); } -#endif /* CONFIG_PCI_MSI */ +#ifdef CONFIG_DMAR +#ifdef CONFIG_SMP +static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .unmask = dmar_msi_unmask, + .mask = dmar_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = dmar_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + +#endif /* CONFIG_PCI_MSI */ /* * Hypertransport interrupt support */ @@ -2938,6 +3542,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2951,7 +3556,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - irq_to_desc(irq)->affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif @@ -2974,7 +3580,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) tmp = TARGET_CPUS; err = assign_irq_vector(irq, tmp); - if ( !err) { + if (!err) { struct ht_irq_msg msg; unsigned dest; @@ -3007,11 +3613,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) #endif /* CONFIG_HT_IRQ */ /* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration + ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI +#ifdef CONFIG_X86_32 int __init io_apic_get_unique_id(int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; @@ -3086,7 +3693,6 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } - int __init io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3098,9 +3704,9 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } +#endif - -int __init io_apic_get_redir_entries(int ioapic) +int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3113,10 +3719,10 @@ int __init io_apic_get_redir_entries(int ioapic) } -int io_apic_set_pci_routing(int ioapic, int pin, int irq, int triggering, int polarity) +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { if (!IO_APIC_IRQ(irq)) { - printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); return -EINVAL; } @@ -3132,6 +3738,7 @@ int io_apic_set_pci_routing(int ioapic, int pin, int irq, int triggering, int po return 0; } + int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { int i; @@ -3163,7 +3770,6 @@ void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; struct irq_cfg *cfg; - struct irq_desc *desc; if (skip_ioapic_setup == 1) return; @@ -3184,43 +3790,124 @@ void __init setup_ioapic_dest(void) setup_IO_APIC_irq(ioapic, pin, irq, irq_trigger(irq_entry), irq_polarity(irq_entry)); - else { - desc = irq_to_desc(irq); +#ifdef CONFIG_INTR_REMAP + else if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); +#endif + else set_ioapic_affinity_irq(irq, TARGET_CPUS); - } } } } #endif +#ifdef CONFIG_X86_64 +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} +#endif + void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; +#ifdef CONFIG_X86_64 + struct resource *ioapic_res; + ioapic_res = ioapic_setup_resources(); +#endif for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif } else { +#ifdef CONFIG_X86_32 fake_ioapic_page: +#endif ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); + alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); - printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); idx++; + +#ifdef CONFIG_X86_64 + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } +#endif } } +#ifdef CONFIG_X86_64 +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); +#endif -- cgit v1.2.3-70-g09d2 From 26d347c2c035b1f4c5b3c5094f3046db9ec920f5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:42 -0700 Subject: rename io_apic_64.c and io_apic_32.c to io_apic.c The two files are now line by line equal. (sans a printk) Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/io_apic.c | 3913 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/io_apic_32.c | 3913 ------------------------------------------ arch/x86/kernel/io_apic_64.c | 3913 ------------------------------------------ 4 files changed, 3914 insertions(+), 7827 deletions(-) create mode 100644 arch/x86/kernel/io_apic.c delete mode 100644 arch/x86/kernel/io_apic_32.c delete mode 100644 arch/x86/kernel/io_apic_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d41f0343dc..7264f9c3af8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -61,7 +61,7 @@ obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o +obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c new file mode 100644 index 00000000000..fba6d6ee348 --- /dev/null +++ b/arch/x86/kernel/io_apic.c @@ -0,0 +1,3913 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* time_after() */ +#ifdef CONFIG_ACPI +#include +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define __apicdebuginit(type) static type __init + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + +int first_free_entry; +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +int pin_map_size; + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* I/O APIC entries */ +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; +int nr_ioapics; + +/* MP IRQ source entries */ +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +int skip_ioapic_setup; + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); + +struct irq_cfg; +struct irq_pin_list; +struct irq_cfg { + unsigned int irq; + struct irq_cfg *next; + struct irq_pin_list *irq_2_pin; + cpumask_t domain; + cpumask_t old_domain; + unsigned move_cleanup_count; + u8 vector; + u8 move_in_progress : 1; +}; + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +static struct irq_cfg irq_cfg_legacy[] __initdata = { + [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +}; + +static struct irq_cfg irq_cfg_init = { .irq = -1U, }; +/* need to be biger than size of irq_cfg_legacy */ +static int nr_irq_cfg = 32; + +static int __init parse_nr_irq_cfg(char *arg) +{ + if (arg) { + nr_irq_cfg = simple_strtoul(arg, NULL, 0); + if (nr_irq_cfg < 32) + nr_irq_cfg = 32; + } + return 0; +} + +early_param("nr_irq_cfg", parse_nr_irq_cfg); + +static void init_one_irq_cfg(struct irq_cfg *cfg) +{ + memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); +} + +static struct irq_cfg *irq_cfgx; +static struct irq_cfg *irq_cfgx_free; +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_cfg *cfg; + int legacy_count; + int i; + + cfg = *da->name; + + memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); + + legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + for (i = legacy_count; i < *da->nr; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < *da->nr; i++) + cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = &irq_cfgx[legacy_count]; + irq_cfgx[legacy_count - 1].next = NULL; +} + +#define for_each_irq_cfg(cfg) \ + for (cfg = irq_cfgx; cfg; cfg = cfg->next) + +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + struct irq_cfg *cfg; + + cfg = irq_cfgx; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + cfg = cfg->next; + } + + return NULL; +} + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + struct irq_cfg *cfg, *cfg_pri; + int i; + int count = 0; + + cfg_pri = cfg = irq_cfgx; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + cfg_pri = cfg; + cfg = cfg->next; + count++; + } + + if (!irq_cfgx_free) { + unsigned long phys; + unsigned long total_bytes; + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); + + total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; + if (after_bootmem) + cfg = kzalloc(total_bytes, GFP_ATOMIC); + else + cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); + + if (!cfg) + panic("please boot with nr_irq_cfg= %d\n", count * 2); + + phys = __pa(cfg); + printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + + for (i = 0; i < nr_irq_cfg; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < nr_irq_cfg; i++) + cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = cfg; + } + + cfg = irq_cfgx_free; + irq_cfgx_free = irq_cfgx_free->next; + cfg->next = NULL; + if (cfg_pri) + cfg_pri->next = cfg; + else + irq_cfgx = cfg; + cfg->irq = irq; + printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); +#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG + { + /* dump the results */ + struct irq_cfg *cfg; + unsigned long phys; + unsigned long bytes = sizeof(struct irq_cfg); + + printk(KERN_DEBUG "=========================== %d\n", irq); + printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); + for_each_irq_cfg(cfg) { + phys = __pa(cfg); + printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); + } + printk(KERN_DEBUG "===========================\n"); + } +#endif + return cfg; +} + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *irq_2_pin_head; +/* fill one page ? */ +static int nr_irq_2_pin = 0x100; +static struct irq_pin_list *irq_2_pin_ptr; +static void __init irq_2_pin_init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_pin_list *pin; + int i; + + pin = *da->name; + + for (i = 1; i < *da->nr; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = &pin[0]; +} +DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); + +static struct irq_pin_list *get_one_free_irq_2_pin(void) +{ + struct irq_pin_list *pin; + int i; + + pin = irq_2_pin_ptr; + + if (pin) { + irq_2_pin_ptr = pin->next; + pin->next = NULL; + return pin; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); + + if (after_bootmem) + pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, + GFP_ATOMIC); + else + pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * + nr_irq_2_pin, PAGE_SIZE, 0); + + if (!pin) + panic("can not get more irq_2_pin\n"); + + for (i = 1; i < nr_irq_2_pin; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = pin->next; + pin->next = NULL; + + return pin; +} + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); +} + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + if (sis_apic_bug) + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +#ifdef CONFIG_X86_64 +static bool io_apic_level_ack_pending(unsigned int irq) +{ + struct irq_pin_list *entry; + unsigned long flags; + struct irq_cfg *cfg = irq_cfg(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + int pin; + + if (!entry) + break; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + if (!entry->next) + break; + entry = entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} +#endif + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + union entry_union eu; + eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; +#ifdef CONFIG_INTR_REMAP + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); +#else + io_apic_write(apic, 0x11 + pin*2, dest); +#endif + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +static int assign_irq_vector(int irq, cpumask_t mask); + +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + cfg = irq_cfg(irq); + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + /* + * Only the high 8 bits are valid. + */ + dest = SET_APIC_LOGICAL_ID(dest); + + desc = irq_to_desc(irq); + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + desc->affinity = mask; + spin_unlock_irqrestore(&ioapic_lock, flags); +} +#endif /* CONFIG_SMP */ + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + /* first time to refer irq_cfg, so with new */ + cfg = irq_cfg_alloc(irq); + entry = cfg->irq_2_pin; + if (!entry) { + entry = get_one_free_irq_2_pin(); + cfg->irq_2_pin = entry; + entry->apic = apic; + entry->pin = pin; + printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); + return; + } + + while (entry->next) { + /* not again, please */ + if (entry->apic == apic && entry->pin == pin) + return; + + entry = entry->next; + } + + entry->next = get_one_free_irq_2_pin(); + entry = entry->next; + entry->apic = apic; + entry->pin = pin; + printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq(unsigned int irq, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_cfg *cfg = irq_cfg(irq); + struct irq_pin_list *entry = cfg->irq_2_pin; + int replaced = 0; + + while (entry) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + replaced = 1; + /* every one is different, right? */ + break; + } + entry = entry->next; + } + + /* why? call replace before add? */ + if (!replaced) + add_pin_to_irq(irq, newapic, newpin); +} + +#ifdef CONFIG_X86_64 +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); +} + +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_cfg *cfg; \ + struct irq_pin_list *entry; \ + \ + cfg = irq_cfg(irq); \ + entry = cfg->irq_2_pin; \ + for (;;) { \ + unsigned int reg; \ + if (!entry) \ + break; \ + pin = entry->pin; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ + FINAL; \ + if (!entry->next) \ + break; \ + entry = entry->next; \ + } \ +} + +#define DO_ACTION(name,R,ACTION, FINAL) \ + \ + static void name##_IO_APIC_irq (unsigned int irq) \ + __DO_ACTION(R, ACTION, FINAL) + +/* mask = 1 */ +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) + +/* mask = 0 */ +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) + +#else + +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) +{ + struct irq_cfg *cfg; + struct irq_pin_list *entry; + unsigned int pin, reg; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + for (;;) { + if (!entry) + break; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg &= ~disable; + reg |= enable; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +/* mask = 1 */ +static void __mask_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER); +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED); +} + +#endif + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == dest_SMI) + return; + /* + * Disable it in the IO-APIC irq-routing table: + */ + ioapic_mask_entry(apic, pin); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) +void send_IPI_self(int vector) +{ + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write(APIC_ICR, cfg); +} +#endif /* !CONFIG_SMP && CONFIG_X86_32*/ + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_INTR_REMAP +/* I/O APIC RTE contents at the OS boot up */ +static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + +/* + * Saves and masks all the unmasked IO-APIC RTE's + */ +int save_mask_IO_APIC_setup(void) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + int apic, pin; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + + for (apic = 0; apic < nr_ioapics; apic++) { + early_ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_KERNEL); + if (!early_ioapic_entries[apic]) + return -ENOMEM; + } + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin] = + ioapic_read_entry(apic, pin); + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + return 0; +} + +void restore_IO_APIC_setup(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + ioapic_write_entry(apic, pin, + early_ioapic_entries[apic][pin]); +} + +void reinit_intr_remapped_IO_APIC(int intr_remapping) +{ + /* + * for now plain restore of previous settings. + * TBD: In the case of OS enabling interrupt-remapping, + * IO-APIC RTE's need to be setup to point to interrupt-remapping + * table entries. for now, do a plain restore, and wait for + * the setup_IO_APIC_irqs() to do proper initialization. + */ + restore_IO_APIC_setup(); +} +#endif + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_irqtype == type && + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) && + mp_irqs[i].mp_dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mp_srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) + + return mp_irqs[i].mp_dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mp_srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) + break; + } + if (i < mp_irq_entries) { + int apic; + for(apic = 0; apic < nr_ioapics; apic++) { + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) + return apic; + } + } + + return -1; +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mp_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].mp_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mp_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} + +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) + +static int MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mp_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mp_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); + break; + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mp_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif + break; + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +int (*ioapic_renumber_irq)(int ioapic, int irq); +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mp_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mp_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].mp_srcbusirq; + } else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); + } + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + + return irq; +} + +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + spin_unlock(&vector_lock); +} + +static int __assign_irq_vector(int irq, cpumask_t mask) +{ + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + unsigned int old_vector; + int cpu; + struct irq_cfg *cfg; + + cfg = irq_cfg(irq); + + /* Only try and allocate irqs on cpus that are present */ + cpus_and(mask, mask, cpu_online_map); + + if ((cfg->move_in_progress) || cfg->move_cleanup_count) + return -EBUSY; + + old_vector = cfg->vector; + if (old_vector) { + cpumask_t tmp; + cpus_and(tmp, cfg->domain, mask); + if (!cpus_empty(tmp)) + return 0; + } + + for_each_cpu_mask_nr(cpu, mask) { + cpumask_t domain, new_mask; + int new_cpu; + int vector, offset; + + domain = vector_allocation_domain(cpu); + cpus_and(new_mask, domain, cpu_online_map); + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= first_system_vector) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; +#ifdef CONFIG_X86_64 + if (vector == IA32_SYSCALL_VECTOR) + goto next; +#else + if (vector == SYSCALL_VECTOR) + goto next; +#endif + for_each_cpu_mask_nr(new_cpu, new_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cfg->old_domain = cfg->domain; + } + for_each_cpu_mask_nr(new_cpu, new_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cfg->domain = domain; + return 0; + } + return -ENOSPC; +} + +static int assign_irq_vector(int irq, cpumask_t mask) +{ + int err; + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + err = __assign_irq_vector(irq, mask); + spin_unlock_irqrestore(&vector_lock, flags); + return err; +} + +static void __clear_irq_vector(int irq) +{ + struct irq_cfg *cfg; + cpumask_t mask; + int cpu, vector; + + cfg = irq_cfg(irq); + BUG_ON(!cfg->vector); + + vector = cfg->vector; + cpus_and(mask, cfg->domain, cpu_online_map); + for_each_cpu_mask_nr(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + cfg->vector = 0; + cpus_clear(cfg->domain); +} + +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + /* This function must be called with vector_lock held */ + int irq, vector; + struct irq_cfg *cfg; + + /* Mark the inuse vectors */ + for_each_irq_cfg(cfg) { + if (!cpu_isset(cpu, cfg->domain)) + continue; + vector = cfg->vector; + irq = cfg->irq; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq < 0) + continue; + + cfg = irq_cfg(irq); + if (!cpu_isset(cpu, cfg->domain)) + per_cpu(vector_irq, cpu)[vector] = -1; + } +} + +static struct irq_chip ioapic_chip; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip; +#endif + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif + +static void ioapic_register_intr(int irq, unsigned long trigger) +{ + struct irq_desc *desc; + + /* first time to use this irq_desc */ + if (irq < 16) + desc = irq_to_desc(irq); + else + desc = irq_to_desc_alloc(irq); + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + desc->status |= IRQ_LEVEL; + else + desc->status &= ~IRQ_LEVEL; + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + desc->status |= IRQ_MOVE_PCNTXT; + if (trigger) + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_edge_irq, "edge"); + return; + } +#endif + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_edge_irq, "edge"); +} + +static int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector) +{ + /* + * add it to the IO-APIC irq-routing table: + */ + memset(entry,0,sizeof(*entry)); + +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_ioapic_to_ir(apic); + struct irte irte; + struct IR_IO_APIC_route_entry *ir_entry = + (struct IR_IO_APIC_route_entry *) entry; + int index; + + if (!iommu) + panic("No mapping iommu for ioapic %d\n", apic); + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + panic("Failed to allocate IRTE for ioapic %d\n", apic); + + memset(&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = trigger; + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = vector; + irte.dest_id = IRTE_DEST(destination); + + modify_irte(irq, &irte); + + ir_entry->index2 = (index >> 15) & 0x1; + ir_entry->zero = 0; + ir_entry->format = 1; + ir_entry->index = (index & 0x7fff); + } else +#endif + { + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest = destination; + } + + entry->mask = 0; /* enable IRQ */ + entry->trigger = trigger; + entry->polarity = polarity; + entry->vector = vector; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (trigger) + entry->mask = 1; + return 0; +} + +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, + int trigger, int polarity) +{ + struct irq_cfg *cfg; + struct IO_APIC_route_entry entry; + cpumask_t mask; + + if (!IO_APIC_IRQ(irq)) + return; + + cfg = irq_cfg(irq); + + mask = TARGET_CPUS; + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(mask, cfg->domain, mask); + + apic_printk(APIC_VERBOSE,KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i)\n", + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, + irq, trigger, polarity); + + + if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, + cpu_mask_to_apicid(mask), trigger, polarity, + cfg->vector)) { + printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mp_ioapics[apic].mp_apicid, pin); + __clear_irq_vector(irq); + return; + } + + ioapic_register_intr(irq, trigger); + if (irq < 16) + disable_8259A_irq(irq); + + ioapic_write_entry(apic, pin, entry); +} + +static void __init setup_IO_APIC_irqs(void) +{ + int apic, pin, idx, irq, first_notcon = 1; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); + continue; + } + if (!first_notcon) { + apic_printk(APIC_VERBOSE, " not connected.\n"); + first_notcon = 1; + } + + irq = pin_2_irq(idx, apic, pin); +#ifdef CONFIG_X86_32 + if (multi_timer_check(apic, irq)) + continue; +#endif + add_pin_to_irq(irq, apic, pin); + + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE, " not connected.\n"); +} + +/* + * Set up the timer pin, possibly with the 8259A-master behind. + */ +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, + int vector) +{ + struct IO_APIC_route_entry entry; + +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + return; +#endif + + memset(&entry, 0, sizeof(entry)); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 1; /* mask IRQ now */ + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we may have a 8259A-master in AEOI mode ... + */ + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(apic, pin, entry); +} + + +__apicdebuginit(void) print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + struct irq_cfg *cfg; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("\n"); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(apic, i); + + printk(KERN_DEBUG " %02x %03X ", + i, + entry.dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for_each_irq_cfg(cfg) { + struct irq_pin_list *entry = cfg->irq_2_pin; + if (!entry) + continue; + printk(KERN_DEBUG "IRQ%d ", cfg->irq); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +__apicdebuginit(void) print_APIC_bitfield(int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +__apicdebuginit(void) print_all_local_APICs(void) +{ + on_each_cpu(print_local_APIC, NULL, 1); +} + +__apicdebuginit(void) print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} + +__apicdebuginit(int) print_all_ICs(void) +{ + print_PIC(); + print_all_local_APICs(); + print_IO_APIC(); + + return 0; +} + +fs_initcall(print_all_ICs); + + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + +void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; + int i8259_apic, i8259_pin; + int apic; + unsigned long flags; + +#ifdef CONFIG_X86_32 + int i; + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; +#endif + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + entry = ioapic_read_entry(apic, pin); + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest = read_apic_id(); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); + } + + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +} + +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ + +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + return; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mp_apicid; + + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mp_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mp_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mp_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mp_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mp_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mp_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mp_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_dstapic == old_id) + mp_irqs[i].mp_dstapic + = mp_ioapics[apic].mp_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mp_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mp_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + unsigned long flags; + + if (no_timer_check) + return 1; + + local_save_flags(flags); + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + local_irq_restore(flags); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (time_after(jiffies, t1 + 4)) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ + +static unsigned int startup_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +#ifdef CONFIG_X86_64 +static int ioapic_retrigger_irq(unsigned int irq) +{ + + struct irq_cfg *cfg = irq_cfg(irq); + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + spin_unlock_irqrestore(&vector_lock, flags); + + return 1; +} +#else +static int ioapic_retrigger_irq(unsigned int irq) +{ + send_IPI_self(irq_cfg(irq)->vector); + + return 1; +} +#endif + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ + +#ifdef CONFIG_SMP + +#ifdef CONFIG_INTR_REMAP +static void ir_irq_migration(struct work_struct *work); + +static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For edge triggered, irq migration is a simple atomic update(of vector + * and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we need to modify the io-apic RTE aswell with the update + * vector information, along with modifying IRTE with vector and destination. + * So irq migration for level triggered is little bit more complex compared to + * edge triggered migration. But the good news is, we use the same algorithm + * for level triggered migration as we have today, only difference being, + * we now initiate the irq migration from process context instead of the + * interrupt context. + * + * In future, when we do a directed EOI (combined with cpu EOI broadcast + * suppression) to the IO-APIC, level triggered irq migration will also be + * as simple as edge triggered migration and we can do the irq migration + * with a simple atomic update to IO-APIC RTE. + */ +static void migrate_ioapic_irq(int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + cpumask_t tmp, cleanup_mask; + struct irte irte; + int modify_ioapic_rte; + unsigned int dest; + unsigned long flags; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; + if (modify_ioapic_rte) { + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Modified the IRTE and flushes the Interrupt entry cache. + */ + modify_irte(irq, &irte); + + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc->affinity = mask; +} + +static int migrate_irq_remapped_level(int irq) +{ + int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq(irq); + + if (io_apic_level_ack_pending(irq)) { + /* + * Interrupt in progress. Migrating irq now will change the + * vector information in the IO-APIC RTE and that will confuse + * the EOI broadcast performed by cpu. + * So, delay the irq migration to the next instance. + */ + schedule_delayed_work(&ir_migration_work, 1); + goto unmask; + } + + /* everthing is clear. we have right of way */ + migrate_ioapic_irq(irq, desc->pending_mask); + + ret = 0; + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); + +unmask: + unmask_IO_APIC_irq(irq); + return ret; +} + +static void ir_irq_migration(struct work_struct *work) +{ + unsigned int irq; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + if (desc->status & IRQ_MOVE_PENDING) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->chip->set_affinity || + !(desc->status & IRQ_MOVE_PENDING)) { + desc->status &= ~IRQ_MOVE_PENDING; + spin_unlock_irqrestore(&desc->lock, flags); + continue; + } + + desc->chip->set_affinity(irq, desc->pending_mask); + spin_unlock_irqrestore(&desc->lock, flags); + } + } +} + +/* + * Migrates the IRQ destination in the process context. + */ +static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; + migrate_irq_remapped_level(irq); + return; + } + + migrate_ioapic_irq(irq, mask); +} +#endif + +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + ack_APIC_irq(); +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + goto unlock; + + __get_cpu_var(vector_irq)[vector] = -1; + cfg->move_cleanup_count--; +unlock: + spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void irq_complete_move(unsigned int irq) +{ + struct irq_cfg *cfg = irq_cfg(irq); + unsigned vector, me; + + if (likely(!cfg->move_in_progress)) + return; + + vector = ~get_irq_regs()->orig_ax; + me = smp_processor_id(); + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { + cpumask_t cleanup_mask; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } +} +#else +static inline void irq_complete_move(unsigned int irq) {} +#endif +#ifdef CONFIG_INTR_REMAP +static void ack_x2apic_level(unsigned int irq) +{ + ack_x2APIC_irq(); +} + +static void ack_x2apic_edge(unsigned int irq) +{ + ack_x2APIC_irq(); +} +#endif + +static void ack_apic_edge(unsigned int irq) +{ + irq_complete_move(irq); + move_native_irq(irq); + ack_APIC_irq(); +} + +#ifdef CONFIG_X86_64 +static void ack_apic_level(unsigned int irq) +{ + int do_unmask_irq = 0; + + irq_complete_move(irq); +#ifdef CONFIG_GENERIC_PENDING_IRQ + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } +#endif + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq); + unmask_IO_APIC_irq(irq); + } +} +#else +atomic_t irq_mis_count; +static void ack_apic_level(unsigned int irq) +{ + unsigned long v; + int i; + + irq_complete_move(irq); + move_native_irq(irq); + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} +#endif + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip __read_mostly = { + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ir_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; +#endif + +static inline void init_IO_APIC_traps(void) +{ + int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for_each_irq_cfg(cfg) { + irq = cfg->irq; + if (IO_APIC_IRQ(irq) && !cfg->vector) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); + else { + desc = irq_to_desc(irq); + /* Strange. Oh, well.. */ + desc->chip = &no_irq_chip; + } + } + } +} + +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void unmask_lapic_irq(unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .ack = ack_lapic_irq, +}; + +static void lapic_register_intr(int irq) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + +static void __init setup_nmi(void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); + + enable_NMI_through_LVT0(); + + apic_printk(APIC_VERBOSE, " done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void __init unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } + apic = find_isa_irq_apic(8, mp_INT); + if (apic == -1) { + WARN_ON_ONCE(1); + return; + } + + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + ioapic_write_entry(apic, pin, entry1); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + ioapic_write_entry(apic, pin, entry0); +} + +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. + */ +static inline void __init check_timer(void) +{ + struct irq_cfg *cfg = irq_cfg(0); + int apic1, pin1, apic2, pin2; + unsigned long flags; + unsigned int ver; + int no_pin1 = 0; + + local_irq_save(flags); + + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + assign_irq_vector(0, TARGET_CPUS); + + /* + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); +#ifdef CONFIG_X86_32 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +#endif + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); + + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); +#endif + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + if (no_pin1) { + add_pin_to_irq(0, apic1, pin1); + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + } + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + enable_8259A_irq(0); + } + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + goto out; + } +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); +#endif + clear_IO_APIC_pin(apic1, pin1); + if (!no_pin1) + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); + + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + unmask_IO_APIC_irq(0); + enable_8259A_irq(0); + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + timer_through_8259 = 1; + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + goto out; + } + /* + * Cleanup, just in case ... + */ + disable_8259A_irq(0); + clear_IO_APIC_pin(apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + } + + if (nmi_watchdog == NMI_IO_APIC) { + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = NMI_NONE; + } +#ifdef CONFIG_X86_32 + timer_ack = 0; +#endif + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); + + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + disable_8259A_irq(0); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); + + init_8259A(0); + make_8259A_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); +out: + local_irq_restore(flags); +} + +/* + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro + */ +#define PIC_IRQS (1 << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + +#ifdef CONFIG_X86_32 + enable_IO_APIC(); +#else + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ +#endif + + io_apic_irqs = ~PIC_IRQS; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + /* + * Set up IO-APIC IRQ routing. + */ +#ifdef CONFIG_X86_32 + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#endif + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); +} + +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; +} + +late_initcall(io_apic_bug_finalize); + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) + ioapic_write_entry(dev->id, i, entry[i]); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + .name = "ioapic", + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +/* + * Dynamic irq allocate and deallocation + */ +unsigned int create_irq_nr(unsigned int irq_want) +{ + /* Allocate an unused irq */ + unsigned int irq; + unsigned int new; + unsigned long flags; + struct irq_cfg *cfg_new; + +#ifndef CONFIG_HAVE_SPARSE_IRQ + irq_want = nr_irqs - 1; +#endif + + irq = 0; + spin_lock_irqsave(&vector_lock, flags); + for (new = irq_want; new > 0; new--) { + if (platform_legacy_irq(new)) + continue; + cfg_new = irq_cfg(new); + if (cfg_new && cfg_new->vector != 0) + continue; + /* check if need to create one */ + if (!cfg_new) + cfg_new = irq_cfg_alloc(new); + if (__assign_irq_vector(new, TARGET_CPUS) == 0) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq > 0) { + dynamic_irq_init(irq); + } + return irq; +} + +int create_irq(void) +{ + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + +#ifdef CONFIG_INTR_REMAP + free_irte(irq); +#endif + spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq); + spin_unlock_irqrestore(&vector_lock, flags); +} + +/* + * MSI message composition + */ +#ifdef CONFIG_PCI_MSI +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + struct irq_cfg *cfg; + int err; + unsigned dest; + cpumask_t tmp; + + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (err) + return err; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + memset (&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = 0; /* edge */ + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + modify_irte(irq, &irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else +#endif + { + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + } + return err; +} + +#ifdef CONFIG_SMP +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + read_msi_msg(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + write_msi_msg(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} + +#ifdef CONFIG_INTR_REMAP +/* + * Migrate the MSI irq to another cpumask. This migration is + * done in the process context using interrupt-remapping hardware. + */ +static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned int dest; + cpumask_t tmp, cleanup_mask; + struct irte irte; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * atomically update the IRTE with the new destination and vector. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif +#endif /* CONFIG_SMP */ + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +#ifdef CONFIG_INTR_REMAP +static struct irq_chip msi_ir_chip = { + .name = "IR-PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_x2apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} +#endif + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irq_desc *desc = irq_to_desc(irq); + /* + * irq migration in process context + */ + desc->status |= IRQ_MOVE_PCNTXT; + set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); + } else +#endif + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + return 0; +} + +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +{ + unsigned int irq; + int ret; + unsigned int irq_want; + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + if (irq == 0) + return -1; + +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + ret = msi_alloc_irte(dev, irq, 1); + if (ret < 0) + goto error; +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) { + destroy_irq(irq); + return ret; + } + return 0; + +#ifdef CONFIG_INTR_REMAP +error: + destroy_irq(irq); + return ret; +#endif +} + +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + unsigned int irq; + int ret, sub_handle; + struct msi_desc *desc; + unsigned int irq_want; + +#ifdef CONFIG_INTR_REMAP + struct intel_iommu *iommu = 0; + int index = 0; +#endif + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } + return 0; + +error: + destroy_irq(irq); + return ret; +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + destroy_irq(irq); +} + +#ifdef CONFIG_DMAR +#ifdef CONFIG_SMP +static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .unmask = dmar_msi_unmask, + .mask = dmar_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = dmar_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + +#endif /* CONFIG_PCI_MSI */ +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +#ifdef CONFIG_SMP + +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + target_ht_irq(irq, dest, cfg->vector); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .mask = mask_ht_irq, + .unmask = unmask_ht_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_ht_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + struct irq_cfg *cfg; + int err; + cpumask_t tmp; + + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (!err) { + struct ht_irq_msg msg; + unsigned dest; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((INT_DEST_MODE == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + } + return err; +} +#endif /* CONFIG_HT_IRQ */ + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +#ifdef CONFIG_X86_32 +int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + +int __init io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} +#endif + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) +{ + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); + + return 0; +} + + +int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) +{ + int i; + + if (skip_ioapic_setup) + return -1; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_irqtype == mp_INT && + mp_irqs[i].mp_srcbusirq == bus_irq) + break; + if (i >= mp_irq_entries) + return -1; + + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + return 0; +} + +#endif /* CONFIG_ACPI */ + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + struct irq_cfg *cfg; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + /* setup_IO_APIC_irqs could fail to get vector for some device + * when you have too many devices, because at that time only boot + * cpu is online. + */ + cfg = irq_cfg(irq); + if (!cfg->vector) + setup_IO_APIC_irq(ioapic, pin, irq, + irq_trigger(irq_entry), + irq_polarity(irq_entry)); +#ifdef CONFIG_INTR_REMAP + else if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); +#endif + else + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} +#endif + +#ifdef CONFIG_X86_64 +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} +#endif + +void __init ioapic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; +#ifdef CONFIG_X86_64 + struct resource *ioapic_res; + + ioapic_res = ioapic_setup_resources(); +#endif + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mp_apicaddr; +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif + } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + +#ifdef CONFIG_X86_64 + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } +#endif + } +} + +#ifdef CONFIG_X86_64 +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); +#endif diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c deleted file mode 100644 index fba6d6ee348..00000000000 --- a/arch/x86/kernel/io_apic_32.c +++ /dev/null @@ -1,3913 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku and - * Hidemi Kishimoto , - * further tested and cleaned up by Zach Brown - * and Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* time_after() */ -#ifdef CONFIG_ACPI -#include -#endif -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define __apicdebuginit(type) static type __init - -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -int first_free_entry; -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -int pin_map_size; - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) -int mp_bus_id_to_type[MAX_MP_BUSSES]; -#endif - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -int skip_ioapic_setup; - -static int __init parse_noapic(char *str) -{ - /* disable IO-APIC */ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -struct irq_cfg; -struct irq_pin_list; -struct irq_cfg { - unsigned int irq; - struct irq_cfg *next; - struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, -}; - -static struct irq_cfg irq_cfg_init = { .irq = -1U, }; -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); - -static void init_one_irq_cfg(struct irq_cfg *cfg) -{ - memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); -} - -static struct irq_cfg *irq_cfgx; -static struct irq_cfg *irq_cfgx_free; -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_cfg *cfg; - int legacy_count; - int i; - - cfg = *da->name; - - memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - - legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); - for (i = legacy_count; i < *da->nr; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < *da->nr; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = &irq_cfgx[legacy_count]; - irq_cfgx[legacy_count - 1].next = NULL; -} - -#define for_each_irq_cfg(cfg) \ - for (cfg = irq_cfgx; cfg; cfg = cfg->next) - -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); - -static struct irq_cfg *irq_cfg(unsigned int irq) -{ - struct irq_cfg *cfg; - - cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg = cfg->next; - } - - return NULL; -} - -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) -{ - struct irq_cfg *cfg, *cfg_pri; - int i; - int count = 0; - - cfg_pri = cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg_pri = cfg; - cfg = cfg->next; - count++; - } - - if (!irq_cfgx_free) { - unsigned long phys; - unsigned long total_bytes; - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); - - total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; - if (after_bootmem) - cfg = kzalloc(total_bytes, GFP_ATOMIC); - else - cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!cfg) - panic("please boot with nr_irq_cfg= %d\n", count * 2); - - phys = __pa(cfg); - printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_cfg; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < nr_irq_cfg; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = cfg; - } - - cfg = irq_cfgx_free; - irq_cfgx_free = irq_cfgx_free->next; - cfg->next = NULL; - if (cfg_pri) - cfg_pri->next = cfg; - else - irq_cfgx = cfg; - cfg->irq = irq; - printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); -#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG - { - /* dump the results */ - struct irq_cfg *cfg; - unsigned long phys; - unsigned long bytes = sizeof(struct irq_cfg); - - printk(KERN_DEBUG "=========================== %d\n", irq); - printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); - for_each_irq_cfg(cfg) { - phys = __pa(cfg); - printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); - } - printk(KERN_DEBUG "===========================\n"); - } -#endif - return cfg; -} - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; - -static struct irq_pin_list *irq_2_pin_head; -/* fill one page ? */ -static int nr_irq_2_pin = 0x100; -static struct irq_pin_list *irq_2_pin_ptr; -static void __init irq_2_pin_init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_pin_list *pin; - int i; - - pin = *da->name; - - for (i = 1; i < *da->nr; i++) - pin[i-1].next = &pin[i]; - - irq_2_pin_ptr = &pin[0]; -} -DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); - -static struct irq_pin_list *get_one_free_irq_2_pin(void) -{ - struct irq_pin_list *pin; - int i; - - pin = irq_2_pin_ptr; - - if (pin) { - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; - } - - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); - - if (after_bootmem) - pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, - GFP_ATOMIC); - else - pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * - nr_irq_2_pin, PAGE_SIZE, 0); - - if (!pin) - panic("can not get more irq_2_pin\n"); - - for (i = 1; i < nr_irq_2_pin; i++) - pin[i-1].next = &pin[i]; - - irq_2_pin_ptr = pin->next; - pin->next = NULL; - - return pin; -} - -struct io_apic { - unsigned int index; - unsigned int unused[3]; - unsigned int data; -}; - -static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -{ - return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); -} - -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - return readl(&io_apic->data); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -#ifdef CONFIG_X86_64 -static bool io_apic_level_ack_pending(unsigned int irq) -{ - struct irq_pin_list *entry; - unsigned long flags; - struct irq_cfg *cfg = irq_cfg(irq); - - spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - int pin; - - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); - return true; - } - if (!entry->next) - break; - entry = entry->next; - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return false; -} -#endif - -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -{ - union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; -} - -/* - * When we write a new IO APIC routing entry, we need to write the high - * word first! If the mask bit in the low word is clear, we will enable - * the interrupt, and we need to make sure the entry is fully populated - * before that happens. - */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - union entry_union eu; - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); -} - -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * When we mask an IO APIC routing entry, we need to write the low - * word first, in order to set the mask bit before we change the - * high bits! - */ -static void ioapic_mask_entry(int apic, int pin) -{ - unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - int apic, pin; - struct irq_cfg *cfg; - struct irq_pin_list *entry; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; -#ifdef CONFIG_INTR_REMAP - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); -#else - io_apic_write(apic, 0x11 + pin*2, dest); -#endif - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int assign_irq_vector(int irq, cpumask_t mask); - -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - cfg = irq_cfg(irq); - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); - - desc = irq_to_desc(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - desc->affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} -#endif /* CONFIG_SMP */ - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - struct irq_cfg *cfg; - struct irq_pin_list *entry; - - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(); - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); - return; - } - - while (entry->next) { - /* not again, please */ - if (entry->apic == apic && entry->pin == pin) - return; - - entry = entry->next; - } - - entry->next = get_one_free_irq_2_pin(); - entry = entry->next; - entry->apic = apic; - entry->pin = pin; - printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); -} - -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq(unsigned int irq, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_cfg *cfg = irq_cfg(irq); - struct irq_pin_list *entry = cfg->irq_2_pin; - int replaced = 0; - - while (entry) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - replaced = 1; - /* every one is different, right? */ - break; - } - entry = entry->next; - } - - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq(irq, newapic, newpin); -} - -#ifdef CONFIG_X86_64 -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_cfg *cfg; \ - struct irq_pin_list *entry; \ - \ - cfg = irq_cfg(irq); \ - entry = cfg->irq_2_pin; \ - for (;;) { \ - unsigned int reg; \ - if (!entry) \ - break; \ - pin = entry->pin; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = entry->next; \ - } \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) - -/* mask = 0 */ -DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) - -#else - -static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) -{ - struct irq_cfg *cfg; - struct irq_pin_list *entry; - unsigned int pin, reg; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -/* mask = 1 */ -static void __mask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); -} - -/* mask = 0 */ -static void __unmask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); -} - -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED); -} - -#endif - -static void mask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) - return; - /* - * Disable it in the IO-APIC irq-routing table: - */ - ioapic_mask_entry(apic, pin); -} - -static void clear_IO_APIC (void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) -void send_IPI_self(int vector) -{ - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); -} -#endif /* !CONFIG_SMP && CONFIG_X86_32*/ - -#ifdef CONFIG_X86_32 -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_INTR_REMAP -/* I/O APIC RTE contents at the OS boot up */ -static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; - -/* - * Saves and masks all the unmasked IO-APIC RTE's - */ -int save_mask_IO_APIC_setup(void) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - int apic, pin; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - - for (apic = 0; apic < nr_ioapics; apic++) { - early_ioapic_entries[apic] = - kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_KERNEL); - if (!early_ioapic_entries[apic]) - return -ENOMEM; - } - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - - entry = early_ioapic_entries[apic][pin] = - ioapic_read_entry(apic, pin); - if (!entry.mask) { - entry.mask = 1; - ioapic_write_entry(apic, pin, entry); - } - } - return 0; -} - -void restore_IO_APIC_setup(void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - ioapic_write_entry(apic, pin, - early_ioapic_entries[apic][pin]); -} - -void reinit_intr_remapped_IO_APIC(int intr_remapping) -{ - /* - * for now plain restore of previous settings. - * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(); -} -#endif - -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == type && - (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) && - mp_irqs[i].mp_dstirq == pin) - return i; - - return -1; -} - -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - - return mp_irqs[i].mp_dstirq; - } - return -1; -} - -static int __init find_isa_irq_apic(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - break; - } - if (i < mp_irq_entries) { - int apic; - for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) - return apic; - } - } - - return -1; -} - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mp_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mp_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) default_ISA_polarity(idx) - -static int MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mp_irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } -#endif - break; - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -int (*ioapic_renumber_irq)(int ioapic, int irq); -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mp_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mp_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) { - irq = mp_irqs[idx].mp_srcbusirq; - } else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); - } - -#ifdef CONFIG_X86_32 - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } -#endif - - return irq; -} - -void lock_vector_lock(void) -{ - /* Used to the online set of cpus does not change - * during assign_irq_vector. - */ - spin_lock(&vector_lock); -} - -void unlock_vector_lock(void) -{ - spin_unlock(&vector_lock); -} - -static int __assign_irq_vector(int irq, cpumask_t mask) -{ - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; - unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; - - cfg = irq_cfg(irq); - - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); - - if ((cfg->move_in_progress) || cfg->move_cleanup_count) - return -EBUSY; - - old_vector = cfg->vector; - if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) - return 0; - } - - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; - int new_cpu; - int vector, offset; - - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); - - vector = current_vector; - offset = current_offset; -next: - vector += 8; - if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (unlikely(current_vector == vector)) - continue; -#ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; -#else - if (vector == SYSCALL_VECTOR) - goto next; -#endif - for_each_cpu_mask_nr(new_cpu, new_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) - goto next; - /* Found one! */ - current_vector = vector; - current_offset = offset; - if (old_vector) { - cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; - } - for_each_cpu_mask_nr(new_cpu, new_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cfg->domain = domain; - return 0; - } - return -ENOSPC; -} - -static int assign_irq_vector(int irq, cpumask_t mask) -{ - int err; - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, mask); - spin_unlock_irqrestore(&vector_lock, flags); - return err; -} - -static void __clear_irq_vector(int irq) -{ - struct irq_cfg *cfg; - cpumask_t mask; - int cpu, vector; - - cfg = irq_cfg(irq); - BUG_ON(!cfg->vector); - - vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) - per_cpu(vector_irq, cpu)[vector] = -1; - - cfg->vector = 0; - cpus_clear(cfg->domain); -} - -void __setup_vector_irq(int cpu) -{ - /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ - int irq, vector; - struct irq_cfg *cfg; - - /* Mark the inuse vectors */ - for_each_irq_cfg(cfg) { - if (!cpu_isset(cpu, cfg->domain)) - continue; - vector = cfg->vector; - irq = cfg->irq; - per_cpu(vector_irq, cpu)[vector] = irq; - } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq < 0) - continue; - - cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) - per_cpu(vector_irq, cpu)[vector] = -1; - } -} - -static struct irq_chip ioapic_chip; -#ifdef CONFIG_INTR_REMAP -static struct irq_chip ir_ioapic_chip; -#endif - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -#ifdef CONFIG_X86_32 -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} -#else -static inline int IO_APIC_irq_trigger(int irq) -{ - return 1; -} -#endif - -static void ioapic_register_intr(int irq, unsigned long trigger) -{ - struct irq_desc *desc; - - /* first time to use this irq_desc */ - if (irq < 16) - desc = irq_to_desc(irq); - else - desc = irq_to_desc_alloc(irq); - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - desc->status |= IRQ_LEVEL; - else - desc->status &= ~IRQ_LEVEL; - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - desc->status |= IRQ_MOVE_PCNTXT; - if (trigger) - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_edge_irq, "edge"); - return; - } -#endif - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); -} - -static int setup_ioapic_entry(int apic, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector) -{ - /* - * add it to the IO-APIC irq-routing table: - */ - memset(entry,0,sizeof(*entry)); - -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_ioapic_to_ir(apic); - struct irte irte; - struct IR_IO_APIC_route_entry *ir_entry = - (struct IR_IO_APIC_route_entry *) entry; - int index; - - if (!iommu) - panic("No mapping iommu for ioapic %d\n", apic); - - index = alloc_irte(iommu, irq, 1); - if (index < 0) - panic("Failed to allocate IRTE for ioapic %d\n", apic); - - memset(&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = INT_DEST_MODE; - irte.trigger_mode = trigger; - irte.dlvry_mode = INT_DELIVERY_MODE; - irte.vector = vector; - irte.dest_id = IRTE_DEST(destination); - - modify_irte(irq, &irte); - - ir_entry->index2 = (index >> 15) & 0x1; - ir_entry->zero = 0; - ir_entry->format = 1; - ir_entry->index = (index & 0x7fff); - } else -#endif - { - entry->delivery_mode = INT_DELIVERY_MODE; - entry->dest_mode = INT_DEST_MODE; - entry->dest = destination; - } - - entry->mask = 0; /* enable IRQ */ - entry->trigger = trigger; - entry->polarity = polarity; - entry->vector = vector; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (trigger) - entry->mask = 1; - return 0; -} - -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, - int trigger, int polarity) -{ - struct irq_cfg *cfg; - struct IO_APIC_route_entry entry; - cpumask_t mask; - - if (!IO_APIC_IRQ(irq)) - return; - - cfg = irq_cfg(irq); - - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(mask, cfg->domain, mask); - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i)\n", - apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, - irq, trigger, polarity); - - - if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { - printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mp_ioapics[apic].mp_apicid, pin); - __clear_irq_vector(irq); - return; - } - - ioapic_register_intr(irq, trigger); - if (irq < 16) - disable_8259A_irq(irq); - - ioapic_write_entry(apic, pin, entry); -} - -static void __init setup_IO_APIC_irqs(void) -{ - int apic, pin, idx, irq, first_notcon = 1; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); - continue; - } - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } - - irq = pin_2_irq(idx, apic, pin); -#ifdef CONFIG_X86_32 - if (multi_timer_check(apic, irq)) - continue; -#endif - add_pin_to_irq(irq, apic, pin); - - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); -} - -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, - int vector) -{ - struct IO_APIC_route_entry entry; - -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - return; -#endif - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 1; /* mask IRQ now */ - entry.dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(apic, pin, entry); -} - - -__apicdebuginit(void) print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - union IO_APIC_reg_03 reg_03; - unsigned long flags; - struct irq_cfg *cfg; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - - /* - * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, - * but the value of reg_02 is read as the previous read register - * value, so ignore it if reg_02 == reg_01. - */ - if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - } - - /* - * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 - * or reg_03, but the value of reg_0[23] is read as the previous read - * register value, so ignore it if reg_03 == reg_0[12]. - */ - if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && - reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %03X ", - i, - entry.dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; - if (!entry) - continue; - printk(KERN_DEBUG "IRQ%d ", cfg->irq); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -__apicdebuginit(void) print_APIC_bitfield(int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - icr = apic_icr_read(); - printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -__apicdebuginit(void) print_all_local_APICs(void) -{ - on_each_cpu(print_local_APIC, NULL, 1); -} - -__apicdebuginit(void) print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} - -__apicdebuginit(int) print_all_ICs(void) -{ - print_PIC(); - print_all_local_APICs(); - print_IO_APIC(); - - return 0; -} - -fs_initcall(print_all_ICs); - - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i8259_apic, i8259_pin; - int apic; - unsigned long flags; - -#ifdef CONFIG_X86_32 - int i; - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; -#endif - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - for(apic = 0; apic < nr_ioapics; apic++) { - int pin; - /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); - - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } - } - } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic - * the i8259 probably is not connected the ioapic but give the - * mptable a chance anyway. - */ - i8259_pin = find_isa_irq_pin(0, mp_ExtINT); - i8259_apic = find_isa_irq_apic(0, mp_ExtINT); - /* Trust the MP table if nothing is setup in the hardware */ - if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); - ioapic_i8259.pin = i8259_pin; - ioapic_i8259.apic = i8259_apic; - } - /* Complain if the MP table and the hardware disagree */ - if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - - /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. - */ - if (ioapic_i8259.pin != -1) { - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); - } - - disconnect_bsp_APIC(ioapic_i8259.pin != -1); -} - -#ifdef CONFIG_X86_32 -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc(void) -{ - union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) - return; - - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; - /* - * This is broken; anything with a real cpu count has to - * circumvent this idiocy regardless. - */ - phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mp_apicid; - - if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mp_apicid); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); - mp_ioapics[apic].mp_apicid = reg_00.bits.ID; - } - - /* - * Sanity check, is the ID really free? Every APIC in a - * system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mp_apicid)) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mp_apicid); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) - break; - if (i >= get_physical_broadcast()) - panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); - mp_ioapics[apic].mp_apicid = i; - } else { - physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mp_ioapics[apic].mp_apicid); - physids_or(phys_id_present_map, phys_id_present_map, tmp); - } - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mp_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_dstapic == old_id) - mp_irqs[i].mp_dstapic - = mp_ioapics[apic].mp_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mp_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mp_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE, " ok.\n"); - } -} -#endif - -int no_timer_check __initdata; - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - unsigned long flags; - - if (no_timer_check) - return 1; - - local_save_flags(flags); - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - local_irq_restore(flags); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - - /* jiffies wrap? */ - if (time_after(jiffies, t1 + 4)) - return 1; - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - */ - -static unsigned int startup_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -#ifdef CONFIG_X86_64 -static int ioapic_retrigger_irq(unsigned int irq) -{ - - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); - - return 1; -} -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif - -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - -#ifdef CONFIG_SMP - -#ifdef CONFIG_INTR_REMAP -static void ir_irq_migration(struct work_struct *work); - -static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); - -/* - * Migrate the IO-APIC irq in the presence of intr-remapping. - * - * For edge triggered, irq migration is a simple atomic update(of vector - * and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we need to modify the io-apic RTE aswell with the update - * vector information, along with modifying IRTE with vector and destination. - * So irq migration for level triggered is little bit more complex compared to - * edge triggered migration. But the good news is, we use the same algorithm - * for level triggered migration as we have today, only difference being, - * we now initiate the irq migration from process context instead of the - * interrupt context. - * - * In future, when we do a directed EOI (combined with cpu EOI broadcast - * suppression) to the IO-APIC, level triggered irq migration will also be - * as simple as edge triggered migration and we can do the irq migration - * with a simple atomic update to IO-APIC RTE. - */ -static void migrate_ioapic_irq(int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct irq_desc *desc; - cpumask_t tmp, cleanup_mask; - struct irte irte; - int modify_ioapic_rte; - unsigned int dest; - unsigned long flags; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (get_irte(irq, &irte)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - desc = irq_to_desc(irq); - modify_ioapic_rte = desc->status & IRQ_LEVEL; - if (modify_ioapic_rte) { - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * Modified the IRTE and flushes the Interrupt entry cache. - */ - modify_irte(irq, &irte); - - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc->affinity = mask; -} - -static int migrate_irq_remapped_level(int irq) -{ - int ret = -1; - struct irq_desc *desc = irq_to_desc(irq); - - mask_IO_APIC_irq(irq); - - if (io_apic_level_ack_pending(irq)) { - /* - * Interrupt in progress. Migrating irq now will change the - * vector information in the IO-APIC RTE and that will confuse - * the EOI broadcast performed by cpu. - * So, delay the irq migration to the next instance. - */ - schedule_delayed_work(&ir_migration_work, 1); - goto unmask; - } - - /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, desc->pending_mask); - - ret = 0; - desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); - -unmask: - unmask_IO_APIC_irq(irq); - return ret; -} - -static void ir_irq_migration(struct work_struct *work) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - if (desc->status & IRQ_MOVE_PENDING) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->chip->set_affinity || - !(desc->status & IRQ_MOVE_PENDING)) { - desc->status &= ~IRQ_MOVE_PENDING; - spin_unlock_irqrestore(&desc->lock, flags); - continue; - } - - desc->chip->set_affinity(irq, desc->pending_mask); - spin_unlock_irqrestore(&desc->lock, flags); - } - } -} - -/* - * Migrates the IRQ destination in the process context. - */ -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc->status & IRQ_LEVEL) { - desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - migrate_irq_remapped_level(irq); - return; - } - - migrate_ioapic_irq(irq, mask); -} -#endif - -asmlinkage void smp_irq_move_cleanup_interrupt(void) -{ - unsigned vector, me; - ack_APIC_irq(); -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; - - desc = irq_to_desc(irq); - if (!desc) - continue; - - cfg = irq_cfg(irq); - spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; - - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) - goto unlock; - - __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; -unlock: - spin_unlock(&desc->lock); - } - - irq_exit(); -} - -static void irq_complete_move(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - unsigned vector, me; - - if (likely(!cfg->move_in_progress)) - return; - - vector = ~get_irq_regs()->orig_ax; - me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } -} -#else -static inline void irq_complete_move(unsigned int irq) {} -#endif -#ifdef CONFIG_INTR_REMAP -static void ack_x2apic_level(unsigned int irq) -{ - ack_x2APIC_irq(); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - -static void ack_apic_edge(unsigned int irq) -{ - irq_complete_move(irq); - move_native_irq(irq); - ack_APIC_irq(); -} - -#ifdef CONFIG_X86_64 -static void ack_apic_level(unsigned int irq) -{ - int do_unmask_irq = 0; - - irq_complete_move(irq); -#ifdef CONFIG_GENERIC_PENDING_IRQ - /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { - do_unmask_irq = 1; - mask_IO_APIC_irq(irq); - } -#endif - - /* - * We must acknowledge the irq before we move it or the acknowledge will - * not propagate properly. - */ - ack_APIC_irq(); - - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(irq)) - move_masked_irq(irq); - unmask_IO_APIC_irq(irq); - } -} -#else -atomic_t irq_mis_count; -static void ack_apic_level(unsigned int irq) -{ - unsigned long v; - int i; - - irq_complete_move(irq); - move_native_irq(irq); - /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_cfg(irq)->vector; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} -#endif - -static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -#ifdef CONFIG_INTR_REMAP -static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_x2apic_edge, - .eoi = ack_x2apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; -#endif - -static inline void init_IO_APIC_traps(void) -{ - int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for_each_irq_cfg(cfg) { - irq = cfg->irq; - if (IO_APIC_IRQ(irq) && !cfg->vector) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); - /* Strange. Oh, well.. */ - desc->chip = &no_irq_chip; - } - } - } -} - -/* - * The local APIC irq-chip implementation: - */ - -static void mask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void unmask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static void ack_lapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static struct irq_chip lapic_chip __read_mostly = { - .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, -}; - -static void lapic_register_intr(int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - desc->status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); -} - -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void __init unlock_ExtINT_logic(void) -{ - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) { - WARN_ON_ONCE(1); - return; - } - apic = find_isa_irq_apic(8, mp_INT); - if (apic == -1) { - WARN_ON_ONCE(1); - return; - } - - entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - ioapic_write_entry(apic, pin, entry1); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(apic, pin); - - ioapic_write_entry(apic, pin, entry0); -} - -static int disable_timer_pin_1 __initdata; -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 0; -} -early_param("disable_timer_pin_1", disable_timer_pin_setup); - -int timer_through_8259 __initdata; - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for all platforms. - */ -static inline void __init check_timer(void) -{ - struct irq_cfg *cfg = irq_cfg(0); - int apic1, pin1, apic2, pin2; - unsigned long flags; - unsigned int ver; - int no_pin1 = 0; - - local_irq_save(flags); - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); - - /* - * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. Also - * timer interrupts need to be acknowledged manually in - * the 8259A for the i82489DX when using the NMI - * watchdog as that APIC treats NMIs as level-triggered. - * The AEOI mode will finish them in the 8259A - * automatically. - */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); -#ifdef CONFIG_X86_32 - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); -#endif - - pin1 = find_isa_irq_pin(0, mp_INT); - apic1 = find_isa_irq_apic(0, mp_INT); - pin2 = ioapic_i8259.pin; - apic2 = ioapic_i8259.apic; - - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); - - /* - * Some BIOS writers are clueless and report the ExtINTA - * I/O APIC input from the cascaded 8259A as the timer - * interrupt input. So just in case, if only one pin - * was found above, try it both directly and through the - * 8259A. - */ - if (pin1 == -1) { -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - panic("BIOS bug: timer not connected to IO-APIC"); -#endif - pin1 = pin2; - apic1 = apic2; - no_pin1 = 1; - } else if (pin2 == -1) { - pin2 = pin1; - apic2 = apic1; - } - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); - } - unmask_IO_APIC_irq(0); - if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - goto out; - } -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - panic("timer doesn't work through Interrupt-remapped IO-APIC"); -#endif - clear_IO_APIC_pin(apic1, pin1); - if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); - - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); - enable_8259A_irq(0); - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); - timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - goto out; - } - /* - * Cleanup, just in case ... - */ - disable_8259A_irq(0); - clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); - } - - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); - - lapic_register_intr(0); - apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - disable_8259A_irq(0); - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); - - init_8259A(0); - make_8259A_irq(0); - apic_write(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option.\n"); -out: - local_irq_restore(flags); -} - -/* - * Traditionally ISA IRQ2 is the cascade IRQ, and is not available - * to devices. However there may be an I/O APIC pin available for - * this interrupt regardless. The pin may be left unconnected, but - * typically it will be reused as an ExtINT cascade interrupt for - * the master 8259A. In the MPS case such a pin will normally be - * reported as an ExtINT interrupt in the MP table. With ACPI - * there is no provision for ExtINT interrupts, and in the absence - * of an override it would be treated as an ordinary ISA I/O APIC - * interrupt, that is edge-triggered and unmasked by default. We - * used to do this, but it caused problems on some systems because - * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using - * the same ExtINT cascade interrupt to drive the local APIC of the - * bootstrap processor. Therefore we refrain from routing IRQ2 to - * the I/O APIC in all cases now. No actual device should request - * it anyway. --macro - */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) - -void __init setup_IO_APIC(void) -{ - -#ifdef CONFIG_X86_32 - enable_IO_APIC(); -#else - /* - * calling enable_IO_APIC() is moved to setup_local_APIC for BP - */ -#endif - - io_apic_irqs = ~PIC_IRQS; - - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up IO-APIC IRQ routing. - */ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif - sync_Arb_IDs(); - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); -} - -/* - * Called after all the initialization is done. If we didnt find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device * dev; - int i, size, error; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* - * Dynamic irq allocate and deallocation - */ -unsigned int create_irq_nr(unsigned int irq_want) -{ - /* Allocate an unused irq */ - unsigned int irq; - unsigned int new; - unsigned long flags; - struct irq_cfg *cfg_new; - -#ifndef CONFIG_HAVE_SPARSE_IRQ - irq_want = nr_irqs - 1; -#endif - - irq = 0; - spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new > 0; new--) { - if (platform_legacy_irq(new)) - continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) - continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); - if (__assign_irq_vector(new, TARGET_CPUS) == 0) - irq = new; - break; - } - spin_unlock_irqrestore(&vector_lock, flags); - - if (irq > 0) { - dynamic_irq_init(irq); - } - return irq; -} - -int create_irq(void) -{ - int irq; - - irq = create_irq_nr(nr_irqs - 1); - - if (irq == 0) - irq = -1; - - return irq; -} - -void destroy_irq(unsigned int irq) -{ - unsigned long flags; - - dynamic_irq_cleanup(irq); - -#ifdef CONFIG_INTR_REMAP - free_irte(irq); -#endif - spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); - spin_unlock_irqrestore(&vector_lock, flags); -} - -/* - * MSI message composition - */ -#ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -{ - struct irq_cfg *cfg; - int err; - unsigned dest; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (err) - return err; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - struct irte irte; - int ir_index; - u16 sub_handle; - - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); - - memset (&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = INT_DEST_MODE; - irte.trigger_mode = 0; /* edge */ - irte.dlvry_mode = INT_DELIVERY_MODE; - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - modify_irte(irq, &irte); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); - } else -#endif - { - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } - return err; -} - -#ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - read_msi_msg(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; -} - -#ifdef CONFIG_INTR_REMAP -/* - * Migrate the MSI irq to another cpumask. This migration is - * done in the process context using interrupt-remapping hardware. - */ -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned int dest; - cpumask_t tmp, cleanup_mask; - struct irte irte; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (get_irte(irq, &irte)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * atomically update the IRTE with the new destination and vector. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. - */ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif -#endif /* CONFIG_SMP */ - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -#ifdef CONFIG_INTR_REMAP -static struct irq_chip msi_ir_chip = { - .name = "IR-PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_x2apic_edge, -#ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. - */ -static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) -{ - struct intel_iommu *iommu; - int index; - - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - return -ENOENT; - } - - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); - return -ENOSPC; - } - return index; -} -#endif - -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(dev, irq, &msg); - if (ret < 0) - return ret; - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - struct irq_desc *desc = irq_to_desc(irq); - /* - * irq migration in process context - */ - desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); - } else -#endif - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); - - return 0; -} - -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - unsigned int irq; - int ret; - unsigned int irq_want; - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - - irq = create_irq_nr(irq_want); - if (irq == 0) - return -1; - -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) - goto no_ir; - - ret = msi_alloc_irte(dev, irq, 1); - if (ret < 0) - goto error; -no_ir: -#endif - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - return 0; - -#ifdef CONFIG_INTR_REMAP -error: - destroy_irq(irq); - return ret; -#endif -} - -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - unsigned int irq; - int ret, sub_handle; - struct msi_desc *desc; - unsigned int irq_want; - -#ifdef CONFIG_INTR_REMAP - struct intel_iommu *iommu = 0; - int index = 0; -#endif - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); - if (irq == 0) - return -1; -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) - goto no_ir; - - if (!sub_handle) { - /* - * allocate the consecutive block of IRTE's - * for 'nvec' - */ - index = msi_alloc_irte(dev, irq, nvec); - if (index < 0) { - ret = index; - goto error; - } - } else { - iommu = map_dev_to_ir(dev); - if (!iommu) { - ret = -ENOENT; - goto error; - } - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. - */ - set_irte_irq(irq, iommu, index, sub_handle); - } -no_ir: -#endif - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) - goto error; - sub_handle++; - } - return 0; - -error: - destroy_irq(irq); - return ret; -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - destroy_irq(irq); -} - -#ifdef CONFIG_DMAR -#ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - dmar_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif /* CONFIG_SMP */ - -struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .unmask = dmar_msi_unmask, - .mask = dmar_msi_mask, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = dmar_msi_set_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_dmar_msi(unsigned int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(NULL, irq, &msg); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} -#endif - -#endif /* CONFIG_PCI_MSI */ -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ - -#ifdef CONFIG_SMP - -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - target_ht_irq(irq, dest, cfg->vector); - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - struct irq_cfg *cfg; - int err; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((INT_DEST_MODE == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - } - return err; -} -#endif /* CONFIG_HT_IRQ */ - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -#ifdef CONFIG_X86_32 -int __init io_apic_get_unique_id(int ioapic, int apic_id) -{ - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!check_apicid_used(apic_id_map, i)) - break; - } - - if (i == get_physical_broadcast()) - panic("Max apic_id exceeded!\n"); - - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - tmp = apicid_to_cpu_present(apic_id); - physids_or(apic_id_map, apic_id_map, tmp); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); - return -1; - } - } - - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); - - return apic_id; -} - -int __init io_apic_get_version(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} -#endif - -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); - - return 0; -} - - -int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) -{ - int i; - - if (skip_ioapic_setup) - return -1; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == mp_INT && - mp_irqs[i].mp_srcbusirq == bus_irq) - break; - if (i >= mp_irq_entries) - return -1; - - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); - return 0; -} - -#endif /* CONFIG_ACPI */ - -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - struct irq_cfg *cfg; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - cfg = irq_cfg(irq); - if (!cfg->vector) - setup_IO_APIC_irq(ioapic, pin, irq, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); -#ifdef CONFIG_INTR_REMAP - else if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); -#endif - else - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif - -#ifdef CONFIG_X86_64 -#define IOAPIC_RESOURCE_NAME_SIZE 11 - -static struct resource *ioapic_resources; - -static struct resource * __init ioapic_setup_resources(void) -{ - unsigned long n; - struct resource *res; - char *mem; - int i; - - if (nr_ioapics <= 0) - return NULL; - - n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; - - mem = alloc_bootmem(n); - res = (void *)mem; - - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; - - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } - } - - ioapic_resources = res; - - return res; -} -#endif - -void __init ioapic_init_mappings(void) -{ - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; -#ifdef CONFIG_X86_64 - struct resource *ioapic_res; - - ioapic_res = ioapic_setup_resources(); -#endif - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; -#ifdef CONFIG_X86_32 - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } -#endif - } else { -#ifdef CONFIG_X86_32 -fake_ioapic_page: -#endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - -#ifdef CONFIG_X86_64 - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } -#endif - } -} - -#ifdef CONFIG_X86_64 -static int __init ioapic_insert_resources(void) -{ - int i; - struct resource *r = ioapic_resources; - - if (!r) { - printk(KERN_ERR - "IO APIC resources could be not be allocated.\n"); - return -1; - } - - for (i = 0; i < nr_ioapics; i++) { - insert_resource(&iomem_resource, r); - r++; - } - - return 0; -} - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); -#endif diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c deleted file mode 100644 index 940c4167b32..00000000000 --- a/arch/x86/kernel/io_apic_64.c +++ /dev/null @@ -1,3913 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku and - * Hidemi Kishimoto , - * further tested and cleaned up by Zach Brown - * and Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* time_after() */ -#ifdef CONFIG_ACPI -#include -#endif -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define __apicdebuginit(type) static type __init - -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -int first_free_entry; -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -int pin_map_size; - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) -int mp_bus_id_to_type[MAX_MP_BUSSES]; -#endif - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -int skip_ioapic_setup; - -static int __init parse_noapic(char *str) -{ - /* disable IO-APIC */ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -struct irq_cfg; -struct irq_pin_list; -struct irq_cfg { - unsigned int irq; - struct irq_cfg *next; - struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, -}; - -static struct irq_cfg irq_cfg_init = { .irq = -1U, }; -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); - -static void init_one_irq_cfg(struct irq_cfg *cfg) -{ - memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); -} - -static struct irq_cfg *irq_cfgx; -static struct irq_cfg *irq_cfgx_free; -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_cfg *cfg; - int legacy_count; - int i; - - cfg = *da->name; - - memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - - legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); - for (i = legacy_count; i < *da->nr; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < *da->nr; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = &irq_cfgx[legacy_count]; - irq_cfgx[legacy_count - 1].next = NULL; -} - -#define for_each_irq_cfg(cfg) \ - for (cfg = irq_cfgx; cfg; cfg = cfg->next) - -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); - -static struct irq_cfg *irq_cfg(unsigned int irq) -{ - struct irq_cfg *cfg; - - cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg = cfg->next; - } - - return NULL; -} - -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) -{ - struct irq_cfg *cfg, *cfg_pri; - int i; - int count = 0; - - cfg_pri = cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg_pri = cfg; - cfg = cfg->next; - count++; - } - - if (!irq_cfgx_free) { - unsigned long phys; - unsigned long total_bytes; - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); - - total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; - if (after_bootmem) - cfg = kzalloc(total_bytes, GFP_ATOMIC); - else - cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!cfg) - panic("please boot with nr_irq_cfg= %d\n", count * 2); - - phys = __pa(cfg); - printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_cfg; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < nr_irq_cfg; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = cfg; - } - - cfg = irq_cfgx_free; - irq_cfgx_free = irq_cfgx_free->next; - cfg->next = NULL; - if (cfg_pri) - cfg_pri->next = cfg; - else - irq_cfgx = cfg; - cfg->irq = irq; - printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); -#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG - { - /* dump the results */ - struct irq_cfg *cfg; - unsigned long phys; - unsigned long bytes = sizeof(struct irq_cfg); - - printk(KERN_DEBUG "=========================== %d\n", irq); - printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); - for_each_irq_cfg(cfg) { - phys = __pa(cfg); - printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); - } - printk(KERN_DEBUG "===========================\n"); - } -#endif - return cfg; -} - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; - -static struct irq_pin_list *irq_2_pin_head; -/* fill one page ? */ -static int nr_irq_2_pin = 0x100; -static struct irq_pin_list *irq_2_pin_ptr; -static void __init irq_2_pin_init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_pin_list *pin; - int i; - - pin = *da->name; - - for (i = 1; i < *da->nr; i++) - pin[i-1].next = &pin[i]; - - irq_2_pin_ptr = &pin[0]; -} -DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); - -static struct irq_pin_list *get_one_free_irq_2_pin(void) -{ - struct irq_pin_list *pin; - int i; - - pin = irq_2_pin_ptr; - - if (pin) { - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; - } - - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); - - if (after_bootmem) - pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, - GFP_ATOMIC); - else - pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * - nr_irq_2_pin, PAGE_SIZE, 0); - - if (!pin) - panic("can not get more irq_2_pin\n"); - - for (i = 1; i < nr_irq_2_pin; i++) - pin[i-1].next = &pin[i]; - - irq_2_pin_ptr = pin->next; - pin->next = NULL; - - return pin; -} - -struct io_apic { - unsigned int index; - unsigned int unused[3]; - unsigned int data; -}; - -static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -{ - return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); -} - -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - return readl(&io_apic->data); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -#ifdef CONFIG_X86_64 -static bool io_apic_level_ack_pending(unsigned int irq) -{ - struct irq_pin_list *entry; - unsigned long flags; - struct irq_cfg *cfg = irq_cfg(irq); - - spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - int pin; - - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); - return true; - } - if (!entry->next) - break; - entry = entry->next; - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return false; -} -#endif - -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -{ - union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; -} - -/* - * When we write a new IO APIC routing entry, we need to write the high - * word first! If the mask bit in the low word is clear, we will enable - * the interrupt, and we need to make sure the entry is fully populated - * before that happens. - */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - union entry_union eu; - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); -} - -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * When we mask an IO APIC routing entry, we need to write the low - * word first, in order to set the mask bit before we change the - * high bits! - */ -static void ioapic_mask_entry(int apic, int pin) -{ - unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - int apic, pin; - struct irq_cfg *cfg; - struct irq_pin_list *entry; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; -#ifdef CONFIG_INTR_REMAP - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); -#else - io_apic_write(apic, 0x11 + pin*2, dest); -#endif - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int assign_irq_vector(int irq, cpumask_t mask); - -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - cfg = irq_cfg(irq); - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); - - desc = irq_to_desc(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - desc->affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} -#endif /* CONFIG_SMP */ - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - struct irq_cfg *cfg; - struct irq_pin_list *entry; - - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(); - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); - return; - } - - while (entry->next) { - /* not again, please */ - if (entry->apic == apic && entry->pin == pin) - return; - - entry = entry->next; - } - - entry->next = get_one_free_irq_2_pin(); - entry = entry->next; - entry->apic = apic; - entry->pin = pin; - printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); -} - -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq(unsigned int irq, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_cfg *cfg = irq_cfg(irq); - struct irq_pin_list *entry = cfg->irq_2_pin; - int replaced = 0; - - while (entry) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - replaced = 1; - /* every one is different, right? */ - break; - } - entry = entry->next; - } - - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq(irq, newapic, newpin); -} - -#ifdef CONFIG_X86_64 -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_cfg *cfg; \ - struct irq_pin_list *entry; \ - \ - cfg = irq_cfg(irq); \ - entry = cfg->irq_2_pin; \ - for (;;) { \ - unsigned int reg; \ - if (!entry) \ - break; \ - pin = entry->pin; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = entry->next; \ - } \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) - -/* mask = 0 */ -DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) - -#else - -static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) -{ - struct irq_cfg *cfg; - struct irq_pin_list *entry; - unsigned int pin, reg; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -/* mask = 1 */ -static void __mask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); -} - -/* mask = 0 */ -static void __unmask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); -} - -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED); -} - -#endif - -static void mask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) - return; - /* - * Disable it in the IO-APIC irq-routing table: - */ - ioapic_mask_entry(apic, pin); -} - -static void clear_IO_APIC (void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) -void send_IPI_self(int vector) -{ - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); -} -#endif /* !CONFIG_SMP && CONFIG_X86_32*/ - -#ifdef CONFIG_X86_32 -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_INTR_REMAP -/* I/O APIC RTE contents at the OS boot up */ -static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; - -/* - * Saves and masks all the unmasked IO-APIC RTE's - */ -int save_mask_IO_APIC_setup(void) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - int apic, pin; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - - for (apic = 0; apic < nr_ioapics; apic++) { - early_ioapic_entries[apic] = - kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_KERNEL); - if (!early_ioapic_entries[apic]) - return -ENOMEM; - } - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - - entry = early_ioapic_entries[apic][pin] = - ioapic_read_entry(apic, pin); - if (!entry.mask) { - entry.mask = 1; - ioapic_write_entry(apic, pin, entry); - } - } - return 0; -} - -void restore_IO_APIC_setup(void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - ioapic_write_entry(apic, pin, - early_ioapic_entries[apic][pin]); -} - -void reinit_intr_remapped_IO_APIC(int intr_remapping) -{ - /* - * for now plain restore of previous settings. - * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(); -} -#endif - -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == type && - (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) && - mp_irqs[i].mp_dstirq == pin) - return i; - - return -1; -} - -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - - return mp_irqs[i].mp_dstirq; - } - return -1; -} - -static int __init find_isa_irq_apic(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - break; - } - if (i < mp_irq_entries) { - int apic; - for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) - return apic; - } - } - - return -1; -} - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mp_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mp_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) default_ISA_polarity(idx) - -static int MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mp_irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } -#endif - break; - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -int (*ioapic_renumber_irq)(int ioapic, int irq); -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mp_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mp_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) { - irq = mp_irqs[idx].mp_srcbusirq; - } else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); - } - -#ifdef CONFIG_X86_32 - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } -#endif - - return irq; -} - -void lock_vector_lock(void) -{ - /* Used to the online set of cpus does not change - * during assign_irq_vector. - */ - spin_lock(&vector_lock); -} - -void unlock_vector_lock(void) -{ - spin_unlock(&vector_lock); -} - -static int __assign_irq_vector(int irq, cpumask_t mask) -{ - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; - unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; - - cfg = irq_cfg(irq); - - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); - - if ((cfg->move_in_progress) || cfg->move_cleanup_count) - return -EBUSY; - - old_vector = cfg->vector; - if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) - return 0; - } - - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; - int new_cpu; - int vector, offset; - - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); - - vector = current_vector; - offset = current_offset; -next: - vector += 8; - if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (unlikely(current_vector == vector)) - continue; -#ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; -#else - if (vector == SYSCALL_VECTOR) - goto next; -#endif - for_each_cpu_mask_nr(new_cpu, new_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) - goto next; - /* Found one! */ - current_vector = vector; - current_offset = offset; - if (old_vector) { - cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; - } - for_each_cpu_mask_nr(new_cpu, new_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cfg->domain = domain; - return 0; - } - return -ENOSPC; -} - -static int assign_irq_vector(int irq, cpumask_t mask) -{ - int err; - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, mask); - spin_unlock_irqrestore(&vector_lock, flags); - return err; -} - -static void __clear_irq_vector(int irq) -{ - struct irq_cfg *cfg; - cpumask_t mask; - int cpu, vector; - - cfg = irq_cfg(irq); - BUG_ON(!cfg->vector); - - vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) - per_cpu(vector_irq, cpu)[vector] = -1; - - cfg->vector = 0; - cpus_clear(cfg->domain); -} - -void __setup_vector_irq(int cpu) -{ - /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ - int irq, vector; - struct irq_cfg *cfg; - - /* Mark the inuse vectors */ - for_each_irq_cfg(cfg) { - if (!cpu_isset(cpu, cfg->domain)) - continue; - vector = cfg->vector; - irq = cfg->irq; - per_cpu(vector_irq, cpu)[vector] = irq; - } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq < 0) - continue; - - cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) - per_cpu(vector_irq, cpu)[vector] = -1; - } -} - -static struct irq_chip ioapic_chip; -#ifdef CONFIG_INTR_REMAP -static struct irq_chip ir_ioapic_chip; -#endif - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -#ifdef CONFIG_X86_32 -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} -#else -static inline int IO_APIC_irq_trigger(int irq) -{ - return 1; -} -#endif - -static void ioapic_register_intr(int irq, unsigned long trigger) -{ - struct irq_desc *desc; - - /* first time to use this irq_desc */ - if (irq < 16) - desc = irq_to_desc(irq); - else - desc = irq_to_desc_alloc(irq); - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - desc->status |= IRQ_LEVEL; - else - desc->status &= ~IRQ_LEVEL; - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - desc->status |= IRQ_MOVE_PCNTXT; - if (trigger) - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_edge_irq, "edge"); - return; - } -#endif - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); -} - -static int setup_ioapic_entry(int apic, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector) -{ - /* - * add it to the IO-APIC irq-routing table: - */ - memset(entry,0,sizeof(*entry)); - -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_ioapic_to_ir(apic); - struct irte irte; - struct IR_IO_APIC_route_entry *ir_entry = - (struct IR_IO_APIC_route_entry *) entry; - int index; - - if (!iommu) - panic("No mapping iommu for ioapic %d\n", apic); - - index = alloc_irte(iommu, irq, 1); - if (index < 0) - panic("Failed to allocate IRTE for ioapic %d\n", apic); - - memset(&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = INT_DEST_MODE; - irte.trigger_mode = trigger; - irte.dlvry_mode = INT_DELIVERY_MODE; - irte.vector = vector; - irte.dest_id = IRTE_DEST(destination); - - modify_irte(irq, &irte); - - ir_entry->index2 = (index >> 15) & 0x1; - ir_entry->zero = 0; - ir_entry->format = 1; - ir_entry->index = (index & 0x7fff); - } else -#endif - { - entry->delivery_mode = INT_DELIVERY_MODE; - entry->dest_mode = INT_DEST_MODE; - entry->dest = destination; - } - - entry->mask = 0; /* enable IRQ */ - entry->trigger = trigger; - entry->polarity = polarity; - entry->vector = vector; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (trigger) - entry->mask = 1; - return 0; -} - -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, - int trigger, int polarity) -{ - struct irq_cfg *cfg; - struct IO_APIC_route_entry entry; - cpumask_t mask; - - if (!IO_APIC_IRQ(irq)) - return; - - cfg = irq_cfg(irq); - - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(mask, cfg->domain, mask); - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i)\n", - apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, - irq, trigger, polarity); - - - if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { - printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mp_ioapics[apic].mp_apicid, pin); - __clear_irq_vector(irq); - return; - } - - ioapic_register_intr(irq, trigger); - if (irq < 16) - disable_8259A_irq(irq); - - ioapic_write_entry(apic, pin, entry); -} - -static void __init setup_IO_APIC_irqs(void) -{ - int apic, pin, idx, irq, first_notcon = 1; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); - continue; - } - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } - - irq = pin_2_irq(idx, apic, pin); -#ifdef CONFIG_X86_32 - if (multi_timer_check(apic, irq)) - continue; -#endif - add_pin_to_irq(irq, apic, pin); - - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); -} - -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, - int vector) -{ - struct IO_APIC_route_entry entry; - -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - return; -#endif - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 1; /* mask IRQ now */ - entry.dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(apic, pin, entry); -} - - -__apicdebuginit(void) print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - union IO_APIC_reg_03 reg_03; - unsigned long flags; - struct irq_cfg *cfg; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - - /* - * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, - * but the value of reg_02 is read as the previous read register - * value, so ignore it if reg_02 == reg_01. - */ - if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - } - - /* - * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 - * or reg_03, but the value of reg_0[23] is read as the previous read - * register value, so ignore it if reg_03 == reg_0[12]. - */ - if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && - reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %03X ", - i, - entry.dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; - if (!entry) - continue; - printk(KERN_DEBUG "IRQ%d ", cfg->irq); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -__apicdebuginit(void) print_APIC_bitfield(int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - icr = apic_icr_read(); - printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -__apicdebuginit(void) print_all_local_APICs(void) -{ - on_each_cpu(print_local_APIC, NULL, 1); -} - -__apicdebuginit(void) print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} - -__apicdebuginit(int) print_all_ICs(void) -{ - print_PIC(); - print_all_local_APICs(); - print_IO_APIC(); - - return 0; -} - -fs_initcall(print_all_ICs); - - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i8259_apic, i8259_pin; - int apic; - unsigned long flags; - -#ifdef CONFIG_X86_32 - int i; - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; -#endif - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - for(apic = 0; apic < nr_ioapics; apic++) { - int pin; - /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); - - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } - } - } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic - * the i8259 probably is not connected the ioapic but give the - * mptable a chance anyway. - */ - i8259_pin = find_isa_irq_pin(0, mp_ExtINT); - i8259_apic = find_isa_irq_apic(0, mp_ExtINT); - /* Trust the MP table if nothing is setup in the hardware */ - if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); - ioapic_i8259.pin = i8259_pin; - ioapic_i8259.apic = i8259_apic; - } - /* Complain if the MP table and the hardware disagree */ - if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - - /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. - */ - if (ioapic_i8259.pin != -1) { - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); - } - - disconnect_bsp_APIC(ioapic_i8259.pin != -1); -} - -#ifdef CONFIG_X86_32 -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc(void) -{ - union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) - return; - - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; - /* - * This is broken; anything with a real cpu count has to - * circumvent this idiocy regardless. - */ - phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mp_apicid; - - if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mp_apicid); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); - mp_ioapics[apic].mp_apicid = reg_00.bits.ID; - } - - /* - * Sanity check, is the ID really free? Every APIC in a - * system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mp_apicid)) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mp_apicid); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) - break; - if (i >= get_physical_broadcast()) - panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); - mp_ioapics[apic].mp_apicid = i; - } else { - physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mp_ioapics[apic].mp_apicid); - physids_or(phys_id_present_map, phys_id_present_map, tmp); - } - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mp_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_dstapic == old_id) - mp_irqs[i].mp_dstapic - = mp_ioapics[apic].mp_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mp_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mp_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE, " ok.\n"); - } -} -#endif - -int no_timer_check __initdata; - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - unsigned long flags; - - if (no_timer_check) - return 1; - - local_save_flags(flags); - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - local_irq_restore(flags); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - - /* jiffies wrap? */ - if (time_after(jiffies, t1 + 4)) - return 1; - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - */ - -static unsigned int startup_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -#ifdef CONFIG_X86_64 -static int ioapic_retrigger_irq(unsigned int irq) -{ - - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); - - return 1; -} -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif - -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - -#ifdef CONFIG_SMP - -#ifdef CONFIG_INTR_REMAP -static void ir_irq_migration(struct work_struct *work); - -static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); - -/* - * Migrate the IO-APIC irq in the presence of intr-remapping. - * - * For edge triggered, irq migration is a simple atomic update(of vector - * and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we need to modify the io-apic RTE aswell with the update - * vector information, along with modifying IRTE with vector and destination. - * So irq migration for level triggered is little bit more complex compared to - * edge triggered migration. But the good news is, we use the same algorithm - * for level triggered migration as we have today, only difference being, - * we now initiate the irq migration from process context instead of the - * interrupt context. - * - * In future, when we do a directed EOI (combined with cpu EOI broadcast - * suppression) to the IO-APIC, level triggered irq migration will also be - * as simple as edge triggered migration and we can do the irq migration - * with a simple atomic update to IO-APIC RTE. - */ -static void migrate_ioapic_irq(int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct irq_desc *desc; - cpumask_t tmp, cleanup_mask; - struct irte irte; - int modify_ioapic_rte; - unsigned int dest; - unsigned long flags; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (get_irte(irq, &irte)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - desc = irq_to_desc(irq); - modify_ioapic_rte = desc->status & IRQ_LEVEL; - if (modify_ioapic_rte) { - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * Modified the IRTE and flushes the Interrupt entry cache. - */ - modify_irte(irq, &irte); - - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc->affinity = mask; -} - -static int migrate_irq_remapped_level(int irq) -{ - int ret = -1; - struct irq_desc *desc = irq_to_desc(irq); - - mask_IO_APIC_irq(irq); - - if (io_apic_level_ack_pending(irq)) { - /* - * Interrupt in progress. Migrating irq now will change the - * vector information in the IO-APIC RTE and that will confuse - * the EOI broadcast performed by cpu. - * So, delay the irq migration to the next instance. - */ - schedule_delayed_work(&ir_migration_work, 1); - goto unmask; - } - - /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, desc->pending_mask); - - ret = 0; - desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); - -unmask: - unmask_IO_APIC_irq(irq); - return ret; -} - -static void ir_irq_migration(struct work_struct *work) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - if (desc->status & IRQ_MOVE_PENDING) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->chip->set_affinity || - !(desc->status & IRQ_MOVE_PENDING)) { - desc->status &= ~IRQ_MOVE_PENDING; - spin_unlock_irqrestore(&desc->lock, flags); - continue; - } - - desc->chip->set_affinity(irq, desc->pending_mask); - spin_unlock_irqrestore(&desc->lock, flags); - } - } -} - -/* - * Migrates the IRQ destination in the process context. - */ -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc->status & IRQ_LEVEL) { - desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - migrate_irq_remapped_level(irq); - return; - } - - migrate_ioapic_irq(irq, mask); -} -#endif - -asmlinkage void smp_irq_move_cleanup_interrupt(void) -{ - unsigned vector, me; - ack_APIC_irq(); -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; - - desc = irq_to_desc(irq); - if (!desc) - continue; - - cfg = irq_cfg(irq); - spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; - - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) - goto unlock; - - __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; -unlock: - spin_unlock(&desc->lock); - } - - irq_exit(); -} - -static void irq_complete_move(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - unsigned vector, me; - - if (likely(!cfg->move_in_progress)) - return; - - vector = ~get_irq_regs()->orig_ax; - me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } -} -#else -static inline void irq_complete_move(unsigned int irq) {} -#endif -#ifdef CONFIG_INTR_REMAP -static void ack_x2apic_level(unsigned int irq) -{ - ack_x2APIC_irq(); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - -static void ack_apic_edge(unsigned int irq) -{ - irq_complete_move(irq); - move_native_irq(irq); - ack_APIC_irq(); -} - -#ifdef CONFIG_X86_64 -static void ack_apic_level(unsigned int irq) -{ - int do_unmask_irq = 0; - - irq_complete_move(irq); -#ifdef CONFIG_GENERIC_PENDING_IRQ - /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { - do_unmask_irq = 1; - mask_IO_APIC_irq(irq); - } -#endif - - /* - * We must acknowledge the irq before we move it or the acknowledge will - * not propagate properly. - */ - ack_APIC_irq(); - - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(irq)) - move_masked_irq(irq); - unmask_IO_APIC_irq(irq); - } -} -#else -atomic_t irq_mis_count; -static void ack_apic_level(unsigned int irq) -{ - unsigned long v; - int i; - - irq_complete_move(irq); - move_native_irq(irq); - /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_cfg(irq)->vector; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} -#endif - -static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -#ifdef CONFIG_INTR_REMAP -static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_x2apic_edge, - .eoi = ack_x2apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; -#endif - -static inline void init_IO_APIC_traps(void) -{ - int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for_each_irq_cfg(cfg) { - irq = cfg->irq; - if (IO_APIC_IRQ(irq) && !cfg->vector) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); - /* Strange. Oh, well.. */ - desc->chip = &no_irq_chip; - } - } - } -} - -/* - * The local APIC irq-chip implementation: - */ - -static void mask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void unmask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static void ack_lapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static struct irq_chip lapic_chip __read_mostly = { - .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, -}; - -static void lapic_register_intr(int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - desc->status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); -} - -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void __init unlock_ExtINT_logic(void) -{ - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) { - WARN_ON_ONCE(1); - return; - } - apic = find_isa_irq_apic(8, mp_INT); - if (apic == -1) { - WARN_ON_ONCE(1); - return; - } - - entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - ioapic_write_entry(apic, pin, entry1); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(apic, pin); - - ioapic_write_entry(apic, pin, entry0); -} - -static int disable_timer_pin_1 __initdata; -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 0; -} -early_param("disable_timer_pin_1", disable_timer_pin_setup); - -int timer_through_8259 __initdata; - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for all platforms. - */ -static inline void __init check_timer(void) -{ - struct irq_cfg *cfg = irq_cfg(0); - int apic1, pin1, apic2, pin2; - unsigned long flags; - unsigned int ver; - int no_pin1 = 0; - - local_irq_save(flags); - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); - - /* - * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. Also - * timer interrupts need to be acknowledged manually in - * the 8259A for the i82489DX when using the NMI - * watchdog as that APIC treats NMIs as level-triggered. - * The AEOI mode will finish them in the 8259A - * automatically. - */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); -#ifdef CONFIG_X86_32 - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); -#endif - - pin1 = find_isa_irq_pin(0, mp_INT); - apic1 = find_isa_irq_apic(0, mp_INT); - pin2 = ioapic_i8259.pin; - apic2 = ioapic_i8259.apic; - - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); - - /* - * Some BIOS writers are clueless and report the ExtINTA - * I/O APIC input from the cascaded 8259A as the timer - * interrupt input. So just in case, if only one pin - * was found above, try it both directly and through the - * 8259A. - */ - if (pin1 == -1) { -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - panic("BIOS bug: timer not connected to IO-APIC"); -#endif - pin1 = pin2; - apic1 = apic2; - no_pin1 = 1; - } else if (pin2 == -1) { - pin2 = pin1; - apic2 = apic1; - } - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); - } - unmask_IO_APIC_irq(0); - if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - goto out; - } -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - panic("timer doesn't work through Interrupt-remapped IO-APIC"); -#endif - clear_IO_APIC_pin(apic1, pin1); - if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); - - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); - enable_8259A_irq(0); - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); - timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - goto out; - } - /* - * Cleanup, just in case ... - */ - disable_8259A_irq(0); - clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); - } - - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); - - lapic_register_intr(0); - apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - disable_8259A_irq(0); - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); - - init_8259A(0); - make_8259A_irq(0); - apic_write(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option.\n"); -out: - local_irq_restore(flags); -} - -/* - * Traditionally ISA IRQ2 is the cascade IRQ, and is not available - * to devices. However there may be an I/O APIC pin available for - * this interrupt regardless. The pin may be left unconnected, but - * typically it will be reused as an ExtINT cascade interrupt for - * the master 8259A. In the MPS case such a pin will normally be - * reported as an ExtINT interrupt in the MP table. With ACPI - * there is no provision for ExtINT interrupts, and in the absence - * of an override it would be treated as an ordinary ISA I/O APIC - * interrupt, that is edge-triggered and unmasked by default. We - * used to do this, but it caused problems on some systems because - * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using - * the same ExtINT cascade interrupt to drive the local APIC of the - * bootstrap processor. Therefore we refrain from routing IRQ2 to - * the I/O APIC in all cases now. No actual device should request - * it anyway. --macro - */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) - -void __init setup_IO_APIC(void) -{ - -#ifdef CONFIG_X86_32 - enable_IO_APIC(); -#else - /* - * calling enable_IO_APIC() is moved to setup_local_APIC for BP - */ -#endif - - io_apic_irqs = ~PIC_IRQS; - - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up IO-APIC IRQ routing. - */ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif - sync_Arb_IDs(); - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); -} - -/* - * Called after all the initialization is done. If we didnt find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device * dev; - int i, size, error; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* - * Dynamic irq allocate and deallocation - */ -unsigned int create_irq_nr(unsigned int irq_want) -{ - /* Allocate an unused irq */ - unsigned int irq; - unsigned int new; - unsigned long flags; - struct irq_cfg *cfg_new; - -#ifndef CONFIG_HAVE_SPARSE_IRQ - irq_want = nr_irqs - 1; -#endif - - irq = 0; - spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new > 0; new--) { - if (platform_legacy_irq(new)) - continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) - continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); - if (__assign_irq_vector(new, TARGET_CPUS) == 0) - irq = new; - break; - } - spin_unlock_irqrestore(&vector_lock, flags); - - if (irq > 0) { - dynamic_irq_init(irq); - } - return irq; -} - -int create_irq(void) -{ - int irq; - - irq = create_irq_nr(nr_irqs - 1); - - if (irq == 0) - irq = -1; - - return irq; -} - -void destroy_irq(unsigned int irq) -{ - unsigned long flags; - - dynamic_irq_cleanup(irq); - -#ifdef CONFIG_INTR_REMAP - free_irte(irq); -#endif - spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); - spin_unlock_irqrestore(&vector_lock, flags); -} - -/* - * MSI message composition - */ -#ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -{ - struct irq_cfg *cfg; - int err; - unsigned dest; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (err) - return err; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - struct irte irte; - int ir_index; - u16 sub_handle; - - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); - - memset (&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = INT_DEST_MODE; - irte.trigger_mode = 0; /* edge */ - irte.dlvry_mode = INT_DELIVERY_MODE; - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - modify_irte(irq, &irte); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); - } else -#endif - { - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } - return err; -} - -#ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - read_msi_msg(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; -} - -#ifdef CONFIG_INTR_REMAP -/* - * Migrate the MSI irq to another cpumask. This migration is - * done in the process context using interrupt-remapping hardware. - */ -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned int dest; - cpumask_t tmp, cleanup_mask; - struct irte irte; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (get_irte(irq, &irte)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * atomically update the IRTE with the new destination and vector. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. - */ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif -#endif /* CONFIG_SMP */ - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -#ifdef CONFIG_INTR_REMAP -static struct irq_chip msi_ir_chip = { - .name = "IR-PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_x2apic_edge, -#ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. - */ -static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) -{ - struct intel_iommu *iommu; - int index; - - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - return -ENOENT; - } - - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); - return -ENOSPC; - } - return index; -} -#endif - -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(dev, irq, &msg); - if (ret < 0) - return ret; - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - struct irq_desc *desc = irq_to_desc(irq); - /* - * irq migration in process context - */ - desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); - } else -#endif - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); - - return 0; -} - -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - unsigned int irq; - int ret; - unsigned int irq_want; - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - - irq = create_irq_nr(irq_want); - if (irq == 0) - return -1; - -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) - goto no_ir; - - ret = msi_alloc_irte(dev, irq, 1); - if (ret < 0) - goto error; -no_ir: -#endif - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - return 0; - -#ifdef CONFIG_INTR_REMAP -error: - destroy_irq(irq); - return ret; -#endif -} - -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - unsigned int irq; - int ret, sub_handle; - struct msi_desc *desc; - unsigned int irq_want; - -#ifdef CONFIG_INTR_REMAP - struct intel_iommu *iommu = 0; - int index = 0; -#endif - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); - if (irq == 0) - return -1; -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) - goto no_ir; - - if (!sub_handle) { - /* - * allocate the consecutive block of IRTE's - * for 'nvec' - */ - index = msi_alloc_irte(dev, irq, nvec); - if (index < 0) { - ret = index; - goto error; - } - } else { - iommu = map_dev_to_ir(dev); - if (!iommu) { - ret = -ENOENT; - goto error; - } - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. - */ - set_irte_irq(irq, iommu, index, sub_handle); - } -no_ir: -#endif - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) - goto error; - sub_handle++; - } - return 0; - -error: - destroy_irq(irq); - return ret; -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - destroy_irq(irq); -} - -#ifdef CONFIG_DMAR -#ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - dmar_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif /* CONFIG_SMP */ - -struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .unmask = dmar_msi_unmask, - .mask = dmar_msi_mask, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = dmar_msi_set_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_dmar_msi(unsigned int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(NULL, irq, &msg); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} -#endif - -#endif /* CONFIG_PCI_MSI */ -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ - -#ifdef CONFIG_SMP - -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - target_ht_irq(irq, dest, cfg->vector); - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - struct irq_cfg *cfg; - int err; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((INT_DEST_MODE == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - } - return err; -} -#endif /* CONFIG_HT_IRQ */ - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -#ifdef CONFIG_X86_32 -int __init io_apic_get_unique_id(int ioapic, int apic_id) -{ - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!check_apicid_used(apic_id_map, i)) - break; - } - - if (i == get_physical_broadcast()) - panic("Max apic_id exceeded!\n"); - - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - tmp = apicid_to_cpu_present(apic_id); - physids_or(apic_id_map, apic_id_map, tmp); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); - return -1; - } - } - - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); - - return apic_id; -} - -int __init io_apic_get_version(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} -#endif - -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); - - return 0; -} - - -int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) -{ - int i; - - if (skip_ioapic_setup) - return -1; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == mp_INT && - mp_irqs[i].mp_srcbusirq == bus_irq) - break; - if (i >= mp_irq_entries) - return -1; - - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); - return 0; -} - -#endif /* CONFIG_ACPI */ - -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - struct irq_cfg *cfg; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - cfg = irq_cfg(irq); - if (!cfg->vector) - setup_IO_APIC_irq(ioapic, pin, irq, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); -#ifdef CONFIG_INTR_REMAP - else if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); -#endif - else - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif - -#ifdef CONFIG_X86_64 -#define IOAPIC_RESOURCE_NAME_SIZE 11 - -static struct resource *ioapic_resources; - -static struct resource * __init ioapic_setup_resources(void) -{ - unsigned long n; - struct resource *res; - char *mem; - int i; - - if (nr_ioapics <= 0) - return NULL; - - n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; - - mem = alloc_bootmem(n); - res = (void *)mem; - - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; - - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } - } - - ioapic_resources = res; - - return res; -} -#endif - -void __init ioapic_init_mappings(void) -{ - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; -#ifdef CONFIG_X86_64 - struct resource *ioapic_res; - - ioapic_res = ioapic_setup_resources(); -#endif - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; -#ifdef CONFIG_X86_32 - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } -#endif - } else { -#ifdef CONFIG_X86_32 -fake_ioapic_page: -#endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - -#ifdef CONFIG_X86_64 - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } -#endif - } -} - -#ifdef CONFIG_X86_64 -static int __init ioapic_insert_resources(void) -{ - int i; - struct resource *r = ioapic_resources; - - if (!r) { - printk(KERN_ERR - "IO APIC resources could be not be allocated.\n"); - return -1; - } - - for (i = 0; i < nr_ioapics; i++) { - insert_resource(&iomem_resource, r); - r++; - } - - return 0; -} - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); -#endif -- cgit v1.2.3-70-g09d2 From c691cc84529ec88ccb32b174535bb61875888c90 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:43 -0700 Subject: io_apic: make 32 bit have io_apic resource in /proc/iomem Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index fba6d6ee348..7e303e0967a 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3802,7 +3802,6 @@ void __init setup_ioapic_dest(void) } #endif -#ifdef CONFIG_X86_64 #define IOAPIC_RESOURCE_NAME_SIZE 11 static struct resource *ioapic_resources; @@ -3838,17 +3837,14 @@ static struct resource * __init ioapic_setup_resources(void) return res; } -#endif void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; -#ifdef CONFIG_X86_64 struct resource *ioapic_res; ioapic_res = ioapic_setup_resources(); -#endif for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; @@ -3877,17 +3873,14 @@ fake_ioapic_page: __fix_to_virt(idx), ioapic_phys); idx++; -#ifdef CONFIG_X86_64 if (ioapic_res != NULL) { ioapic_res->start = ioapic_phys; ioapic_res->end = ioapic_phys + (4 * 1024) - 1; ioapic_res++; } -#endif } } -#ifdef CONFIG_X86_64 static int __init ioapic_insert_resources(void) { int i; @@ -3910,4 +3903,3 @@ static int __init ioapic_insert_resources(void) /* Insert the IO APIC resources after PCI initialization has occured to handle * IO APICS that are mapped in on a BAR in PCI space. */ late_initcall(ioapic_insert_resources); -#endif -- cgit v1.2.3-70-g09d2 From 4e738e2f307113feaedebae147c3e0d072e39648 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:47 -0700 Subject: x86: unify mask_IO_APIC_irq use MACRO for 32 bit too Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 80 +++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 7e303e0967a..099696109ca 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -610,18 +610,7 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } -#ifdef CONFIG_X86_64 -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ +#define __DO_ACTION(R, ACTION_ENABLE, ACTION_DISABLE, FINAL) \ \ { \ int pin; \ @@ -636,7 +625,8 @@ static inline void io_apic_sync(unsigned int apic) break; \ pin = entry->pin; \ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ + reg ACTION_DISABLE; \ + reg ACTION_ENABLE; \ io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ FINAL; \ if (!entry->next) \ @@ -645,66 +635,38 @@ static inline void io_apic_sync(unsigned int apic) } \ } -#define DO_ACTION(name,R,ACTION, FINAL) \ +#define DO_ACTION(name,R, ACTION_ENABLE, ACTION_DISABLE, FINAL) \ \ static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) + __DO_ACTION(R, ACTION_ENABLE, ACTION_DISABLE, FINAL) /* mask = 0 */ -DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) +DO_ACTION(__unmask, 0, |= 0, &= ~IO_APIC_REDIR_MASKED, ) -#else - -static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) +#ifdef CONFIG_X86_64 +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) { - struct irq_cfg *cfg; - struct irq_pin_list *entry; - unsigned int pin, reg; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); } /* mask = 1 */ -static void __mask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); -} +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, &= ~0, io_apic_sync(entry->apic)) -/* mask = 0 */ -static void __unmask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); -} +#else + +/* mask = 1 */ +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, &= ~0, ) /* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER); -} +DO_ACTION(__mask_and_edge, 0, |= IO_APIC_REDIR_MASKED, &= ~IO_APIC_REDIR_LEVEL_TRIGGER, ) /* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED); -} +DO_ACTION(__unmask_and_level, 0, |= IO_APIC_REDIR_LEVEL_TRIGGER, &= ~IO_APIC_REDIR_MASKED, ) #endif -- cgit v1.2.3-70-g09d2 From 3eb2cce84beae8fd41de950569cafd5bca7edd5d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:48 -0700 Subject: x86: unify ack_apic_edge use code in 64 to replace move_native_irq(irq, desc); in 32 bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 73 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 099696109ca..a466b04ad5a 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -389,7 +389,6 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } -#ifdef CONFIG_X86_64 static bool io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; @@ -419,7 +418,6 @@ static bool io_apic_level_ack_pending(unsigned int irq) return false; } -#endif union entry_union { struct { u32 w1, w2; }; @@ -2398,9 +2396,16 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_32 +atomic_t irq_mis_count; +#endif + static void ack_apic_level(unsigned int irq) { +#ifdef CONFIG_X86_32 + unsigned long v; + int i; +#endif int do_unmask_irq = 0; irq_complete_move(irq); @@ -2412,6 +2417,31 @@ static void ack_apic_level(unsigned int irq) } #endif +#ifdef CONFIG_X86_32 + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); +#endif + /* * We must acknowledge the irq before we move it or the acknowledge will * not propagate properly. @@ -2450,41 +2480,8 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq(irq); } -} -#else -atomic_t irq_mis_count; -static void ack_apic_level(unsigned int irq) -{ - unsigned long v; - int i; - - irq_complete_move(irq); - move_native_irq(irq); - /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_cfg(irq)->vector; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); +#ifdef CONFIG_X86_32 if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); @@ -2492,8 +2489,8 @@ static void ack_apic_level(unsigned int irq) __unmask_and_level_IO_APIC_irq(irq); spin_unlock(&ioapic_lock); } -} #endif +} static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", -- cgit v1.2.3-70-g09d2 From 29ccbbf232c035b8c7ff0c5060fbe30a66ed9b99 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:49 -0700 Subject: x86: remove first_free_entry/pin_map_size no user now Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 7 ------- arch/x86/kernel/setup.c | 4 ---- include/asm-x86/irq.h | 3 --- 3 files changed, 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index a466b04ad5a..5de2d38812a 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -72,13 +72,6 @@ int sis_apic_bug = -1; static DEFINE_SPINLOCK(ioapic_lock); static DEFINE_SPINLOCK(vector_lock); -int first_free_entry; -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -int pin_map_size; - /* * # of IRQ routing registers */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 02e3a669797..d90c659b70e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1074,10 +1074,6 @@ void __init setup_arch(char **cmdline_p) nr_irqs = 32 * nr_cpu_ids + 224; init_cpu_to_node(); #endif -#ifdef CONFIG_X86_IO_APIC - pin_map_size = nr_irqs * 2; - first_free_entry = nr_irqs; -#endif init_apic_mappings(); ioapic_init_mappings(); diff --git a/include/asm-x86/irq.h b/include/asm-x86/irq.h index 2a130c44f5d..1e5f2909c1d 100644 --- a/include/asm-x86/irq.h +++ b/include/asm-x86/irq.h @@ -10,9 +10,6 @@ #include #include -extern int pin_map_size; -extern int first_free_entry; - static inline int irq_canonicalize(int irq) { return ((irq == 2) ? 9 : irq); -- cgit v1.2.3-70-g09d2 From ffd5aae7817fba22c5c3e304a31c44fa0a4e9a97 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:50 -0700 Subject: x86: print local APIC of APs one by one instead of print that of all APs at the time Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 5de2d38812a..bf0e66d7303 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1777,7 +1777,12 @@ __apicdebuginit(void) print_local_APIC(void *dummy) __apicdebuginit(void) print_all_local_APICs(void) { - on_each_cpu(print_local_APIC, NULL, 1); + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + preempt_enable(); } __apicdebuginit(void) print_PIC(void) -- cgit v1.2.3-70-g09d2 From 8f09cd20a24c5f13c571bc73ddcd47be0af3b70f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:51 -0700 Subject: x86: make HAVE_SPARSE_IRQ support selectable Ingo said sparse_irq is some intrusive. need to make it selectable to make it simple, remove irq_desc as parameter in some functions. (ack, eoi, set_affinity). may need to make member if irq_chip to take irq_desc, or struct irq later. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/Kconfig | 3 -- arch/x86/Kconfig | 12 +++++++- arch/x86/kernel/io_apic.c | 71 ++++++++++++++++++++++++++++++++--------------- arch/x86/kernel/irq_32.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- 5 files changed, 62 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index b3676224626..c8a7c2eb649 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -106,6 +106,3 @@ config HAVE_CLK config HAVE_DYN_ARRAY def_bool n -config HAVE_SPARSE_IRQ - def_bool n - diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b157e94637b..600584b7a49 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,7 +34,6 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_DYN_ARRAY - select HAVE_SPARSE_IRQ config ARCH_DEFCONFIG string @@ -238,6 +237,17 @@ config SMP If you don't know what to do here, say N. +config HAVE_SPARSE_IRQ + bool "Support sparse irq numbering" + depends on PCI_MSI || HT_IRQ + default y + help + This enables support for sparse irq, esp for msi/msi-x. the irq + number will be bus/dev/fn + 12bit. You may need if you have lots of + cards supports msi-x installed. + + If you don't know what to do here, say Y. + config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bf0e66d7303..f853b667fa5 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -107,7 +107,9 @@ struct irq_cfg; struct irq_pin_list; struct irq_cfg { unsigned int irq; +#ifdef CONFIG_HAVE_SPARSE_IRQ struct irq_cfg *next; +#endif struct irq_pin_list *irq_2_pin; cpumask_t domain; cpumask_t old_domain; @@ -137,20 +139,6 @@ static struct irq_cfg irq_cfg_legacy[] __initdata = { }; static struct irq_cfg irq_cfg_init = { .irq = -1U, }; -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); static void init_one_irq_cfg(struct irq_cfg *cfg) { @@ -158,7 +146,9 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) } static struct irq_cfg *irq_cfgx; +#ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_cfg *irq_cfgx_free; +#endif static void __init init_work(void *data) { struct dyn_array *da = data; @@ -174,15 +164,34 @@ static void __init init_work(void *data) for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); +#ifdef CONFIG_HAVE_SPARSE_IRQ for (i = 1; i < *da->nr; i++) cfg[i-1].next = &cfg[i]; irq_cfgx_free = &irq_cfgx[legacy_count]; irq_cfgx[legacy_count - 1].next = NULL; +#endif +} + +#ifdef CONFIG_HAVE_SPARSE_IRQ +/* need to be biger than size of irq_cfg_legacy */ +static int nr_irq_cfg = 32; + +static int __init parse_nr_irq_cfg(char *arg) +{ + if (arg) { + nr_irq_cfg = simple_strtoul(arg, NULL, 0); + if (nr_irq_cfg < 32) + nr_irq_cfg = 32; + } + return 0; } -#define for_each_irq_cfg(cfg) \ - for (cfg = irq_cfgx; cfg; cfg = cfg->next) +early_param("nr_irq_cfg", parse_nr_irq_cfg); + +#define for_each_irq_cfg(irqX, cfg) \ + for (cfg = irq_cfgx, irqX = cfg->irq; cfg; cfg = cfg->next, irqX = cfg ? cfg->irq : -1U) + DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); @@ -273,7 +282,26 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) #endif return cfg; } +#else + +#define for_each_irq_cfg(irq, cfg) \ + for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq]) + +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); +struct irq_cfg *irq_cfg(unsigned int irq) +{ + if (irq < nr_irqs) + return &irq_cfgx[irq]; + + return NULL; +} +struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + return irq_cfg(irq); +} + +#endif /* * This is performance-critical, we want to do it O(1) * @@ -1282,11 +1310,10 @@ void __setup_vector_irq(int cpu) struct irq_cfg *cfg; /* Mark the inuse vectors */ - for_each_irq_cfg(cfg) { + for_each_irq_cfg(irq, cfg) { if (!cpu_isset(cpu, cfg->domain)) continue; vector = cfg->vector; - irq = cfg->irq; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -1563,6 +1590,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; + unsigned int irq; if (apic_verbosity == APIC_QUIET) return; @@ -1651,11 +1679,11 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(cfg) { + for_each_irq_cfg(irq, cfg) { struct irq_pin_list *entry = cfg->irq_2_pin; if (!entry) continue; - printk(KERN_DEBUG "IRQ%d ", cfg->irq); + printk(KERN_DEBUG "IRQ%d ", irq); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -2535,8 +2563,7 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_cfg(cfg) { - irq = cfg->irq; + for_each_irq_cfg(irq, cfg) { if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index cc929f2f84f..b2e1082cf5a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -269,7 +269,7 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction * action; unsigned long flags; unsigned int entries; - struct irq_desc *desc; + struct irq_desc *desc = NULL; int tail = 0; #ifdef CONFIG_HAVE_SPARSE_IRQ diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 348a11168c2..c2fca89a3f7 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -74,7 +74,7 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction * action; unsigned long flags; unsigned int entries; - struct irq_desc *desc; + struct irq_desc *desc = NULL; int tail = 0; #ifdef CONFIG_HAVE_SPARSE_IRQ -- cgit v1.2.3-70-g09d2 From 9d6a4d0823b3b8e29156f5e698b5a68687afad32 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:52 -0700 Subject: x86: probe nr_irqs even only mptable is used for !CONFIG_HAVE_SPARSE_IRQ fix: In file included from arch/x86/kernel/early-quirks.c:18: include/asm/io_apic.h: In function 'probe_nr_irqs': include/asm/io_apic.h:209: error: 'NR_IRQS' undeclared (first use in this function) include/asm/io_apic.h:209: error: (Each undeclared identifier is reported only once include/asm/io_apic.h:209: error: for each function it appears in.) v2: fix by Ingo Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 25 ------------------------- arch/x86/kernel/io_apic.c | 43 ++++++++++++++++++++++++++++++------------- arch/x86/kernel/setup.c | 6 +++--- include/asm-x86/io_apic.h | 8 ++++++++ include/asm-x86/mpspec.h | 1 - 5 files changed, 41 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3e9d163fd92..5fef4fece4a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -957,29 +957,6 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } -int get_nr_irqs_via_madt(void) -{ - int idx; - int nr = 0; - - for (idx = 0; idx < nr_ioapics; idx++) { - if (mp_ioapic_routing[idx].gsi_end > nr) - nr = mp_ioapic_routing[idx].gsi_end; - } - - nr++; - - /* double it for hotplug and msi and nmi */ - nr <<= 1; - - /* something wrong ? */ - if (nr < 32) - nr = 32; - - return nr; - -} - static void assign_to_mp_irq(struct mp_config_intsrc *m, struct mp_config_intsrc *mp_irq) { @@ -1278,8 +1255,6 @@ static int __init acpi_parse_madt_ioapic_entries(void) } - nr_irqs = get_nr_irqs_via_madt(); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, nr_irqs); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index f853b667fa5..f7e80262cbb 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3596,6 +3596,36 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + +int __init probe_nr_irqs(void) +{ + int idx; + int nr = 0; + + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx); + + /* double it for hotplug and msi and nmi */ + nr <<= 1; + + /* something wrong ? */ + if (nr < 32) + nr = 32; + + return nr; +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -3690,19 +3720,6 @@ int __init io_apic_get_version(int ioapic) } #endif -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { if (!IO_APIC_IRQ(irq)) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d90c659b70e..61335306696 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1069,15 +1069,15 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); #ifdef CONFIG_X86_64 - /* need to wait for nr_cpu_ids settle down */ - if (nr_irqs == NR_IRQS) - nr_irqs = 32 * nr_cpu_ids + 224; init_cpu_to_node(); #endif init_apic_mappings(); ioapic_init_mappings(); + /* need to wait for io_apic is mapped */ + nr_irqs = probe_nr_irqs(); + kvm_guest_init(); e820_reserve_resources(); diff --git a/include/asm-x86/io_apic.h b/include/asm-x86/io_apic.h index ce818292d2c..d35cbd7aa58 100644 --- a/include/asm-x86/io_apic.h +++ b/include/asm-x86/io_apic.h @@ -4,6 +4,7 @@ #include #include #include +#include /* * Intel IO-APIC support for SMP and UP systems. @@ -187,10 +188,17 @@ extern void restore_IO_APIC_setup(void); extern void reinit_intr_remapped_IO_APIC(int); #endif +extern int probe_nr_irqs(void); + #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } + +static inline int probe_nr_irqs(void) +{ + return NR_IRQS; +} #endif #endif /* ASM_X86__IO_APIC_H */ diff --git a/include/asm-x86/mpspec.h b/include/asm-x86/mpspec.h index a0748021250..be2241a818f 100644 --- a/include/asm-x86/mpspec.h +++ b/include/asm-x86/mpspec.h @@ -60,7 +60,6 @@ extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); -extern int get_nr_irqs_via_madt(void); #ifdef CONFIG_X86_IO_APIC extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, u32 gsi, int triggering, int polarity); -- cgit v1.2.3-70-g09d2 From bf9d3cf73e8caf0b3ae0b7f508a9f251536f5ff4 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 18 Aug 2008 22:17:08 -0700 Subject: xen: Fix bug `do_IRQ: cannot handle IRQ -1 vector 0x6 cpu 1' Following commit 9c3f2468d8339866d9ef6a25aae31a8909c6be0d, do_IRQ() looks up the IRQ number in the per-cpu variable vector_irq. This commit makes Xen initialise an identity vector_irq map for both X86_32 and X86_64. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/irq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 28b85ab8422..bb042608c60 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -21,7 +21,6 @@ void xen_force_evtchn_callback(void) static void __init __xen_init_IRQ(void) { -#ifdef CONFIG_X86_64 int i; /* Create identity vector->irq map */ @@ -31,7 +30,6 @@ static void __init __xen_init_IRQ(void) for_each_possible_cpu(cpu) per_cpu(vector_irq, cpu)[i] = i; } -#endif /* CONFIG_X86_64 */ xen_init_IRQ(); } -- cgit v1.2.3-70-g09d2 From 0c425cec64eb0c0d0dd7037c21a25585cbe3636c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Aug 2008 13:04:26 +0200 Subject: warning: fix arch x86 kernel io_apic c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix warning: arch/x86/kernel/io_apic.c: In function ‘print_local_APIC’: arch/x86/kernel/io_apic.c:1786: warning: format ‘%08x’ expects type ‘unsigned int’, but argument 2 has type ‘u64’ arch/x86/kernel/io_apic.c:1787: warning: format ‘%08x’ expects type ‘unsigned int’, but argument 2 has type ‘u64’ By creating uniform behavior on 32-bit and 64-bit and printing out the ICR value in two 32-bit words. Code has changed: text data bss dec hex filename 22901 19650 17040 59591 e8c7 io_apic.o.before 22899 19650 17040 59589 e8c5 io_apic.o.after Due to the 32-bit cast narrowing the printed out value on 64-bit. Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index f7e80262cbb..34c74cf5c24 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1774,8 +1774,8 @@ __apicdebuginit(void) print_local_APIC(void *dummy) } icr = apic_icr_read(); - printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); + printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); v = apic_read(APIC_LVTT); printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); -- cgit v1.2.3-70-g09d2 From e89eb43863c2d9f11a3bbe766766fe646e6c50d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 Aug 2008 20:46:25 -0700 Subject: x86: sparse_irq needs spin_lock in allocations Suresh Siddha noticed that we should have a spinlock around it. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 13 ++++++++++++- kernel/irq/handle.c | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 34c74cf5c24..7ca55669047 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -146,6 +146,12 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) } static struct irq_cfg *irq_cfgx; + +/* + * Protect the irq_cfgx_free freelist: + */ +static DEFINE_SPINLOCK(irq_cfg_lock); + #ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_cfg *irq_cfgx_free; #endif @@ -213,8 +219,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) static struct irq_cfg *irq_cfg_alloc(unsigned int irq) { struct irq_cfg *cfg, *cfg_pri; - int i; + unsigned long flags; int count = 0; + int i; cfg_pri = cfg = irq_cfgx; while (cfg) { @@ -226,6 +233,7 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) count++; } + spin_lock_irqsave(&irq_cfg_lock, flags); if (!irq_cfgx_free) { unsigned long phys; unsigned long total_bytes; @@ -263,6 +271,9 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) else irq_cfgx = cfg; cfg->irq = irq; + + spin_unlock_irqrestore(&irq_cfg_lock, flags); + printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); #ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG { diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 24c83a3cee4..d638a911cbc 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -107,6 +107,11 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) } } +/* + * Protect the sparse_irqs_free freelist: + */ +static DEFINE_SPINLOCK(sparse_irq_lock); + #ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_desc *sparse_irqs_free; struct irq_desc *sparse_irqs; @@ -166,11 +171,13 @@ struct irq_desc *irq_to_desc(unsigned int irq) } return NULL; } + struct irq_desc *irq_to_desc_alloc(unsigned int irq) { struct irq_desc *desc, *desc_pri; - int i; + unsigned long flags; int count = 0; + int i; desc_pri = desc = sparse_irqs; while (desc) { @@ -182,6 +189,7 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq) count++; } + spin_lock_irqsave(&sparse_irq_lock, flags); /* * we run out of pre-allocate ones, allocate more */ @@ -223,6 +231,9 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq) else sparse_irqs = desc; desc->irq = irq; + + spin_unlock_irqrestore(&sparse_irq_lock, flags); + printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq); #ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG { -- cgit v1.2.3-70-g09d2 From a2d332fa3445160519de03c350a59602ac1c3df9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 21 Aug 2008 12:56:32 -0700 Subject: x86: fix 32-bit ioapic lockup with sparseirqs Missed two lines when copying. Fix panic on one of Ingo's machines that need to adjust ioapic id when acpi off/ 32bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 7ca55669047..4e44fd1f466 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2077,6 +2077,8 @@ static void __init setup_ioapic_ids_from_mpc(void) reg_00.bits.ID = mp_ioapics[apic].mp_apicid; spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check -- cgit v1.2.3-70-g09d2 From 052c0bff9b83a578654dfa513d6e3d0b3795f1e8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 21 Aug 2008 13:10:09 -0700 Subject: x86: fix probe_nr_irqs for xen otherwise Xen is _completely_ unusable with 5 or more VCPUs. (when !CONFIG_HAVE_SPARSE_IRQ). based on Alex Nixon's patch. also add +1 offset after redir_entries Signed-off-by: Yinghai Lu Acked-by: Jeremy Fitzhardinge Acked-by: Alex Nixon Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 4e44fd1f466..d28128e0392 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3625,16 +3625,21 @@ int __init probe_nr_irqs(void) { int idx; int nr = 0; +#ifndef CONFIG_XEN + int nr_min = 32; +#else + int nr_min = NR_IRQS; +#endif for (idx = 0; idx < nr_ioapics; idx++) - nr += io_apic_get_redir_entries(idx); + nr += io_apic_get_redir_entries(idx) + 1; /* double it for hotplug and msi and nmi */ nr <<= 1; /* something wrong ? */ - if (nr < 32) - nr = 32; + if (nr < nr_min) + nr = nr_min; return nr; } -- cgit v1.2.3-70-g09d2 From 2699574b3c04351f5a23c8a842e17c303d7ebb6f Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Thu, 21 Aug 2008 11:26:43 -0700 Subject: x86: VMI, initialize IRQ vector Initialize vector_irq for the vmi used vector, to point to correct irq. Signed-off-by: Alok N Kataria Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmiclock_32.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 6953859fe28..254ee07f863 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void) void __init vmi_time_init(void) { + unsigned int cpu; /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.3-70-g09d2 From db4b5525caafd846ec20f95afbc6403c792e22cf Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:39 -0700 Subject: x86: apic_64.c - setup_APIC_timer has to be __cpuinit function There is no need to hold this code if CPU_HOTPLUG is not defined. Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index cd860856a0b..8f551276681 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -400,7 +400,7 @@ static void lapic_timer_broadcast(cpumask_t mask) * Setup the local APIC timer for this CPU. Copy the initilized values * of the boot CPU and register the clock event in the framework. */ -static void setup_APIC_timer(void) +static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); -- cgit v1.2.3-70-g09d2 From 7c37e48b5125fdb0acac846a0a3af42806175d44 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:40 -0700 Subject: x86: apic - introduce get_physical_broadcast for 64bit We don't really use it now on 64bit mode but could reserve it for future. Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 8f551276681..bb46711a0b1 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -244,6 +244,16 @@ void __cpuinit enable_NMI_through_LVT0(void) apic_write(APIC_LVT0, v); } +#ifdef CONFIG_X86_32 +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} +#endif + /** * lapic_get_maxlvt - get the maximum number of local vector table entries */ -- cgit v1.2.3-70-g09d2 From 920fa7a507c3b1004a9ebe07a2c9d38605b3406a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:41 -0700 Subject: x86: apic - unify setup_apicpmtimer Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 10 ++++++++++ arch/x86/kernel/apic_64.c | 2 ++ 2 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 21c831d96af..df6ed603e54 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1774,6 +1774,16 @@ static int __init parse_nolapic_timer(char *arg) } early_param("nolapic_timer", parse_nolapic_timer); +#ifdef CONFIG_X86_64 +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + static int __init apic_set_verbosity(char *arg) { if (!arg) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index bb46711a0b1..ddc5b245faf 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1810,6 +1810,7 @@ static int __init parse_nolapic_timer(char *arg) } early_param("nolapic_timer", parse_nolapic_timer); +#ifdef CONFIG_X86_64 static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; @@ -1817,6 +1818,7 @@ static __init int setup_apicpmtimer(char *s) return 0; } __setup("apicpmtimer", setup_apicpmtimer); +#endif static int __init apic_set_verbosity(char *arg) { -- cgit v1.2.3-70-g09d2 From 80e5609cabd7e4321769701a70297f819a15b08d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:42 -0700 Subject: x86: apic_64.c - add sanity check for spurious vector definition Do not check for SPUTIOUS_APIC_VECTOR definition twice. Check it once - is what we need. Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index ddc5b245faf..4587e16f73e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -46,6 +46,13 @@ #include #include +/* + * Sanity check + */ +#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) +# error SPURIOUS_APIC_VECTOR definition error +#endif + /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; @@ -939,8 +946,6 @@ void __cpuinit setup_local_APIC(void) preempt_disable(); value = apic_read(APIC_LVR); - BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); - /* * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. -- cgit v1.2.3-70-g09d2 From 89c38c2867ebe37c4c5aee23e7fa1bffb025b171 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:43 -0700 Subject: x86: apic - unify setup_local_APIC - remove useless read of APIC_LVR - wrap with preempt_disable/enable - check for integrated APIC just in place v2: fix by Yinghai Lu. fix lapic_is_integrated using let 64-bit too have pic_mode Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 21 ++++++++++++++----- arch/x86/kernel/apic_64.c | 51 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index df6ed603e54..f8c1251984e 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1033,9 +1033,10 @@ static void __cpuinit lapic_setup_esr(void) */ void __cpuinit setup_local_APIC(void) { - unsigned long value, integrated; + unsigned int value; int i, j; +#ifdef CONFIG_X86_32 /* Pound the ESR really hard over the head with a big hammer - mbligh */ if (esr_disable) { apic_write(APIC_ESR, 0); @@ -1043,14 +1044,16 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); } +#endif - integrated = lapic_is_integrated(); + preempt_disable(); /* * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. */ if (!apic_id_registered()) - WARN_ON_ONCE(1); + BUG(); /* * Intel recommends to set DFR, LDR and TPR before enabling @@ -1096,6 +1099,7 @@ void __cpuinit setup_local_APIC(void) */ value |= APIC_SPIV_APIC_ENABLED; +#ifdef CONFIG_X86_32 /* * Some unknown Intel IO/APIC (or APIC) errata is biting us with * certain networking cards. If high frequency interrupts are @@ -1116,8 +1120,13 @@ void __cpuinit setup_local_APIC(void) * See also the comment in end_level_ioapic_irq(). --macro */ - /* Enable focus processor (bit==0) */ + /* + * - enable focus processor (bit==0) + * - 64bit mode always use processor focus + * so no need to set it + */ value &= ~APIC_SPIV_FOCUS_DISABLED; +#endif /* * Set spurious IRQ vector @@ -1154,9 +1163,11 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!integrated) /* 82489DX */ + if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); + + preempt_enable(); } void __cpuinit end_local_APIC_setup(void) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 4587e16f73e..283968d4e02 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -76,6 +76,8 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; */ unsigned int apic_verbosity; +int pic_mode; + /* Have we found an MP table */ int smp_found_config; @@ -943,8 +945,17 @@ void __cpuinit setup_local_APIC(void) unsigned int value; int i, j; +#ifdef CONFIG_X86_32 + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (esr_disable) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } +#endif + preempt_disable(); - value = apic_read(APIC_LVR); /* * Double-check whether this APIC is really registered. @@ -997,7 +1008,34 @@ void __cpuinit setup_local_APIC(void) */ value |= APIC_SPIV_APIC_ENABLED; - /* We always use processor focus */ +#ifdef CONFIG_X86_32 + /* + * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * certain networking cards. If high frequency interrupts are + * happening on a particular IOAPIC pin, plus the IOAPIC routing + * entry is masked/unmasked at a high rate as well then sooner or + * later IOAPIC line gets 'stuck', no more interrupts are received + * from the device. If focus CPU is disabled then the hang goes + * away, oh well :-( + * + * [ This bug can be reproduced easily with a level-triggered + * PCI Ne2000 networking cards and PII/PIII processors, dual + * BX chipset. ] + */ + /* + * Actually disabling the focus CPU check just makes the hang less + * frequent as it makes the interrupt distributon model be more + * like LRU than MRU (the short-term load is more even across CPUs). + * See also the comment in end_level_ioapic_irq(). --macro + */ + + /* + * - enable focus processor (bit==0) + * - 64bit mode always use processor focus + * so no need to set it + */ + value &= ~APIC_SPIV_FOCUS_DISABLED; +#endif /* * Set spurious IRQ vector @@ -1016,14 +1054,14 @@ void __cpuinit setup_local_APIC(void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && !value) { + if (!smp_processor_id() && (pic_mode || !value)) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", - smp_processor_id()); + smp_processor_id()); } else { value = APIC_DM_EXTINT | APIC_LVT_MASKED; apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", - smp_processor_id()); + smp_processor_id()); } apic_write(APIC_LVT0, value); @@ -1034,7 +1072,10 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); + preempt_enable(); } -- cgit v1.2.3-70-g09d2 From 457cc52d4670bcf1470606a108bbf35aac28eb7f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:44 -0700 Subject: x86: apic_32.c should use __cpuinit section All callers are __init or __cpuinit so there is no need to hold this code without CPU_HOTPLUG being set. Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index f8c1251984e..4ca3717b302 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -383,7 +383,7 @@ static void lapic_timer_broadcast(cpumask_t mask) * Setup the local APIC timer for this CPU. Copy the initilized values * of the boot CPU and register the clock event in the framework. */ -static void __devinit setup_APIC_timer(void) +static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); @@ -652,7 +652,7 @@ void __init setup_boot_APIC_clock(void) setup_APIC_timer(); } -void __devinit setup_secondary_APIC_clock(void) +void __cpuinit setup_secondary_APIC_clock(void) { setup_APIC_timer(); } @@ -1713,7 +1713,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __devinit apic_pm_activate(void) +static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; } -- cgit v1.2.3-70-g09d2 From 6460bc73aac970135104a0bc407c2c8b85394d59 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:45 -0700 Subject: x86: apic - unify smp_apic_timer_interrupt Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 3 +++ arch/x86/kernel/apic_64.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4ca3717b302..913d9924b10 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -718,6 +718,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 283968d4e02..2e109119f64 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -625,7 +625,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); -- cgit v1.2.3-70-g09d2 From b3c5117050e8028d48b2fa0ea09c7a50dd7f3414 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:46 -0700 Subject: x86: apic_xx.c order variables Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 46 +++++++++++++++++++++++---------------------- arch/x86/kernel/apic_64.c | 48 +++++++++++++++++++++++++++++++---------------- 2 files changed, 56 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 913d9924b10..a54512bea62 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -50,16 +50,37 @@ # error SPURIOUS_APIC_VECTOR definition error #endif -unsigned long mp_lapic_addr; - +#ifdef CONFIG_X86_32 /* * Knob to control our willingness to enable the local APIC. * * +1=force-enable */ static int force_enable_local_apic; -int disable_apic; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + force_enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); +#endif +#ifdef CONFIG_X86_64 +static int apic_calibrate_pmtmr __initdata; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + +unsigned long mp_lapic_addr; +int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __cpuinitdata; /* Local APIC timer works in C2 */ @@ -1742,15 +1763,6 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); static int __init setup_disableapic(char *arg) { @@ -1788,16 +1800,6 @@ static int __init parse_nolapic_timer(char *arg) } early_param("nolapic_timer", parse_nolapic_timer); -#ifdef CONFIG_X86_64 -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); -#endif - static int __init apic_set_verbosity(char *arg) { if (!arg) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 2e109119f64..c40a900e155 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -53,16 +53,44 @@ # error SPURIOUS_APIC_VECTOR definition error #endif -/* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; +#ifdef CONFIG_X86_32 +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + force_enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); +#endif + +#ifdef CONFIG_X86_64 static int apic_calibrate_pmtmr __initdata; -int disable_apic; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + int disable_x2apic; int x2apic; - /* x2apic enabled before OS handover */ int x2apic_preenabled; +unsigned long mp_lapic_addr; +int disable_apic; +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +static int disable_apic_timer __cpuinitdata; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -113,8 +141,6 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static unsigned long apic_phys; -unsigned long mp_lapic_addr; - /* * Get the LAPIC version */ @@ -1858,16 +1884,6 @@ static int __init parse_nolapic_timer(char *arg) } early_param("nolapic_timer", parse_nolapic_timer); -#ifdef CONFIG_X86_64 -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); -#endif - static int __init apic_set_verbosity(char *arg) { if (!arg) { -- cgit v1.2.3-70-g09d2 From 49899eacce79ce39faf531dad3e00f771eba2eb1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:47 -0700 Subject: x86: use HAVE_X2APIC in apic_64.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index c40a900e155..d3ec746aede 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -82,10 +82,23 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif -int disable_x2apic; +#ifdef CONFIG_X86_64 +#define HAVE_X2APIC +#endif + +#ifdef HAVE_X2APIC int x2apic; /* x2apic enabled before OS handover */ int x2apic_preenabled; +int disable_x2apic; +static __init int setup_nox2apic(char *str) +{ + disable_x2apic = 1; + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + return 0; +} +early_param("nox2apic", setup_nox2apic); +#endif unsigned long mp_lapic_addr; int disable_apic; @@ -228,6 +241,7 @@ static struct apic_ops xapic_ops = { struct apic_ops __read_mostly *apic_ops = &xapic_ops; EXPORT_SYMBOL_GPL(apic_ops); +#ifdef HAVE_X2APIC static void x2apic_wait_icr_idle(void) { /* no need to wait for icr idle in x2apic */ @@ -261,6 +275,7 @@ static struct apic_ops x2apic_ops = { .wait_icr_idle = x2apic_wait_icr_idle, .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, }; +#endif /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 @@ -1125,6 +1140,7 @@ void __cpuinit end_local_APIC_setup(void) apic_pm_activate(); } +#ifdef HAVE_X2APIC void check_x2apic(void) { int msr, msr2; @@ -1243,6 +1259,7 @@ end: return; } +#endif /* HAVE_X2APIC */ /* * Detect and enable local APICs on non-SMP boards. @@ -1291,10 +1308,12 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { +#ifdef HAVE_X2APIC if (x2apic) { boot_cpu_physical_apicid = read_apic_id(); return; } +#endif /* * If no local APIC can be found then set up a fake all @@ -1335,8 +1354,9 @@ int __init APIC_init_uniprocessor(void) printk(KERN_INFO "Apic disabled by BIOS\n"); return -1; } - +#ifdef HAVE_X2APIC enable_IR_x2apic(); +#endif setup_apic_routing(); verify_local_APIC(); @@ -1672,7 +1692,7 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); -#ifdef CONFIG_X86_64 +#ifdef HAVE_X2APIC if (x2apic) enable_x2apic(); else @@ -1836,15 +1856,6 @@ __cpuinit int apic_is_clustered_box(void) return (clusters > 2); } -static __init int setup_nox2apic(char *str) -{ - disable_x2apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC); - return 0; -} -early_param("nox2apic", setup_nox2apic); - - /* * APIC command line parameters */ -- cgit v1.2.3-70-g09d2 From 3491998dd54f6d4ef7344518fe5463b299fdf537 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:48 -0700 Subject: x86: add hard_smp_prossor_id with MACRO in io_apic_xx.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 7 +++++++ arch/x86/kernel/apic_64.c | 2 ++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a54512bea62..93718ffb9b9 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1600,6 +1600,13 @@ void __cpuinit generic_processor_info(int apicid, int version) cpu_set(cpu, cpu_present_map); } +#ifdef CONFIG_X86_64 +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} +#endif + /* * Power management */ diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index d3ec746aede..eeb69838c2f 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1612,10 +1612,12 @@ void __cpuinit generic_processor_info(int apicid, int version) cpu_set(cpu, cpu_present_map); } +#ifdef CONFIG_X86_64 int hard_smp_processor_id(void) { return read_apic_id(); } +#endif /* * Power management -- cgit v1.2.3-70-g09d2 From f28c0ae21d80ffd6eb0987901c5273843387e341 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:49 -0700 Subject: x86: make apic_32/64.c more like except x2apic, detec_init_APIC, and calibrating_APIC_clock Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 125 +++++++++++++++++++++++++++++++++++++++++----- arch/x86/kernel/apic_64.c | 10 +++- 2 files changed, 122 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 93718ffb9b9..bfac26a1609 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -66,6 +66,9 @@ static int __init parse_lapic(char *arg) return 0; } early_param("lapic", parse_lapic); +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + #endif #ifdef CONFIG_X86_64 @@ -131,9 +134,6 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -/* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; - static unsigned long apic_phys; /* @@ -240,6 +240,7 @@ void __cpuinit enable_NMI_through_LVT0(void) apic_write(APIC_LVT0, v); } +#ifdef CONFIG_X86_32 /** * get_physical_broadcast - Get number of physical broadcast IDs */ @@ -247,6 +248,7 @@ int get_physical_broadcast(void) { return modern_apic() ? 0xff : 0xf; } +#endif /** * lapic_get_maxlvt - get the maximum number of local vector table entries @@ -1291,6 +1293,32 @@ no_apic: return -1; } +#ifdef CONFIG_X86_64 +void __init early_init_lapic_mapping(void) +{ + unsigned long phys_addr; + + /* + * If no local APIC can be found then go out + * : it means there is no mpatable and MADT + */ + if (!smp_found_config) + return; + + phys_addr = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, phys_addr); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_physical_apicid = read_apic_id(); +} +#endif + /** * init_apic_mappings - initialize APIC mappings */ @@ -1308,8 +1336,8 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, - apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, apic_phys); /* * Fetch the APIC ID of the BSP in case we have a @@ -1317,14 +1345,12 @@ void __init init_apic_mappings(void) */ if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = read_apic_id(); - } /* * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ - int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) @@ -1682,11 +1708,6 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); -#ifdef CONFIG_X86_64 - if (x2apic) - enable_x2apic(); - else -#endif { /* * Make sure the APICBASE points to the right address @@ -1770,7 +1791,87 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ +#ifdef CONFIG_X86_64 +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. Use available data to take a good guess. + * If in doubt, go HPET. + */ +__cpuinit int apic_is_clustered_box(void) +{ + int i, clusters, zeros; + unsigned id; + u16 *bios_cpu_apicid; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + /* + * there is not this kind of box with AMD CPU yet. + * Some AMD box with quadcore cpu and 8 sockets apicid + * will be [4, 0x23] or [8, 0x27] could be thought to + * vsmp box still need checking... + */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) + return 0; + + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < NR_CPUS; i++) { + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } + else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } + else + break; + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. + */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards + */ + if (is_vsmp_box() && clusters > 1) + return 1; + + /* + * If clusters > 2, then should be multi-chassis. + * May have to revisit this when multi-core + hyperthreaded CPUs come + * out, but AFAIK this will work even for them. + */ + return (clusters > 2); +} +#endif + +/* + * APIC command line parameters + */ static int __init setup_disableapic(char *arg) { disable_apic = 1; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index eeb69838c2f..dca6c246e73 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -69,6 +69,9 @@ static int __init parse_lapic(char *arg) return 0; } early_param("lapic", parse_lapic); +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + #endif #ifdef CONFIG_X86_64 @@ -1279,6 +1282,7 @@ static int __init detect_init_APIC(void) return 0; } +#ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) { unsigned long phys_addr; @@ -1302,6 +1306,7 @@ void __init early_init_lapic_mapping(void) */ boot_cpu_physical_apicid = read_apic_id(); } +#endif /** * init_apic_mappings - initialize APIC mappings @@ -1334,7 +1339,8 @@ void __init init_apic_mappings(void) * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - boot_cpu_physical_apicid = read_apic_id(); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); } /* @@ -1782,6 +1788,7 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ +#ifdef CONFIG_X86_64 /* * apic_is_clustered_box() -- Check if we can expect good TSC * @@ -1857,6 +1864,7 @@ __cpuinit int apic_is_clustered_box(void) */ return (clusters > 2); } +#endif /* * APIC command line parameters -- cgit v1.2.3-70-g09d2 From fa2bd35a8d5c88c03b638c72daf7f38a132d0e8c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:50 -0700 Subject: x86: merge APIC_init_uniprocessor Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 51 +++++++++++++++++++++++++++++++++++++++++------ arch/x86/kernel/apic_64.c | 48 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 90 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index bfac26a1609..8f8b0e1f3eb 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1355,6 +1355,17 @@ int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { +#ifdef CONFIG_X86_64 + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } +#else if (!smp_found_config && !cpu_has_apic) return -1; @@ -1368,34 +1379,62 @@ int __init APIC_init_uniprocessor(void) clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } +#endif +#ifdef HAVE_X2APIC + enable_IR_x2apic(); +#endif +#ifdef CONFIG_X86_64 + setup_apic_routing(); +#endif verify_local_APIC(); - connect_bsp_APIC(); +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); +#else /* * Hack: In case of kdump, after a crash, kernel might be booting * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid * might be zero if read from MP tables. Get it from LAPIC. */ -#ifdef CONFIG_CRASH_DUMP +# ifdef CONFIG_CRASH_DUMP boot_cpu_physical_apicid = read_apic_id(); +# endif #endif physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - setup_local_APIC(); +#ifdef CONFIG_X86_64 + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling vector on BP + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); +#endif + #ifdef CONFIG_X86_IO_APIC if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) #endif localise_nmi_watchdog(); end_local_APIC_setup(); + #ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +# ifdef CONFIG_X86_64 + else + nr_ioapics = 0; +# endif #endif + +#ifdef CONFIG_X86_64 + setup_boot_APIC_clock(); + check_nmi_watchdog(); +#else setup_boot_clock(); +#endif return 0; } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index dca6c246e73..16a30c22b07 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1351,6 +1351,7 @@ int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { +#ifdef CONFIG_X86_64 if (disable_apic) { printk(KERN_INFO "Apic disabled\n"); return -1; @@ -1360,37 +1361,78 @@ int __init APIC_init_uniprocessor(void) printk(KERN_INFO "Apic disabled by BIOS\n"); return -1; } +#else + if (!smp_found_config && !cpu_has_apic) + return -1; + + /* + * Complain if the BIOS pretends there is one. + */ + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + return -1; + } +#endif + #ifdef HAVE_X2APIC enable_IR_x2apic(); #endif +#ifdef CONFIG_X86_64 setup_apic_routing(); +#endif verify_local_APIC(); - connect_bsp_APIC(); - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); +#ifdef CONFIG_X86_64 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); - +#else + /* + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. + */ +# ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = read_apic_id(); +# endif +#endif + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); setup_local_APIC(); +#ifdef CONFIG_X86_64 /* * Now enable IO-APICs, actually call clear_IO_APIC * We need clear_IO_APIC before enabling vector on BP */ if (!skip_ioapic_setup && nr_ioapics) enable_IO_APIC(); +#endif +#ifdef CONFIG_X86_IO_APIC if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) +#endif localise_nmi_watchdog(); end_local_APIC_setup(); +#ifdef CONFIG_X86_IO_APIC if (smp_found_config && !skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); +# ifdef CONFIG_X86_64 else nr_ioapics = 0; +# endif +#endif + +#ifdef CONFIG_X86_64 setup_boot_APIC_clock(); check_nmi_watchdog(); +#else + setup_boot_clock(); +#endif + return 0; } -- cgit v1.2.3-70-g09d2 From be7a656fe131cba088912bcafb079b029320504d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:51 -0700 Subject: x86: copy detect_init_APIC to the other Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 20 ++++++++++++ arch/x86/kernel/apic_64.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 8f8b0e1f3eb..1103e05aa1b 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1214,6 +1214,25 @@ void __cpuinit end_local_APIC_setup(void) apic_pm_activate(); } +#ifdef CONFIG_X86_64 +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ +static int __init detect_init_APIC(void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_physical_apicid = 0; + return 0; +} +#else /* * Detect and initialize APIC */ @@ -1292,6 +1311,7 @@ no_apic: printk(KERN_INFO "No local APIC present or hardware disabled\n"); return -1; } +#endif #ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 16a30c22b07..b7268f5c8b1 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -684,7 +684,6 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } - /* * Local APIC start and shutdown */ @@ -1264,6 +1263,7 @@ end: } #endif /* HAVE_X2APIC */ +#ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. @@ -1281,6 +1281,86 @@ static int __init detect_init_APIC(void) boot_cpu_physical_apicid = 0; return 0; } +#else +/* + * Detect and initialize APIC + */ +static int __init detect_init_APIC(void) +{ + u32 h, l, features; + + /* Disabled by kernel option? */ + if (disable_apic) + return -1; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || + (boot_cpu_data.x86 == 15)) + break; + goto no_apic; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || + (boot_cpu_data.x86 == 5 && cpu_has_apic)) + break; + goto no_apic; + default: + goto no_apic; + } + + if (!cpu_has_apic) { + /* + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. + */ + if (!force_enable_local_apic) { + printk(KERN_INFO "Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); + return -1; + } + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + printk(KERN_INFO + "Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + } + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + printk(KERN_WARNING "Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + printk(KERN_INFO "Found and enabled local APIC!\n"); + + apic_pm_activate(); + + return 0; + +no_apic: + printk(KERN_INFO "No local APIC present or hardware disabled\n"); + return -1; +} +#endif #ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) -- cgit v1.2.3-70-g09d2 From 773763df7de881e65ff2600c024c9ce2dde64750 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:52 -0700 Subject: x86: merge header files in apic_xx.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 8 ++++++++ arch/x86/kernel/apic_64.c | 7 ++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 1103e05aa1b..0ec8321e687 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -23,11 +23,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -36,8 +38,14 @@ #include #include #include +#include #include #include +#include +#include +#include +#include +#include #include #include diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index b7268f5c8b1..ebe417b4d7f 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -24,9 +24,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -34,8 +36,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -43,8 +47,9 @@ #include #include -#include #include +#include +#include /* * Sanity check -- cgit v1.2.3-70-g09d2 From dc1528dd864a0b79fa67b60b3ca5674fe94fdce5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:53 -0700 Subject: x86: apic unify smp_spurious/error_interrupt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 24 +++++++++++++++++++++--- arch/x86/kernel/apic_64.c | 24 ++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 0ec8321e687..60a901b2d4f 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1474,10 +1474,17 @@ int __init APIC_init_uniprocessor(void) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ +#ifdef CONFIG_X86_64 +asmlinkage void smp_spurious_interrupt(void) +#else void smp_spurious_interrupt(struct pt_regs *regs) +#endif { - unsigned long v; + u32 v; +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1488,20 +1495,31 @@ void smp_spurious_interrupt(struct pt_regs *regs) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); +#ifdef CONFIG_X86_64 + add_pda(irq_spurious_count, 1); +#else /* see sw-dev-man vol 3, chapter 7.4.13.5 */ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); __get_cpu_var(irq_stat).irq_spurious_count++; +#endif irq_exit(); } /* * This interrupt should never happen with our APIC/SMP architecture */ +#ifdef CONFIG_X86_64 +asmlinkage void smp_error_interrupt(void) +#else void smp_error_interrupt(struct pt_regs *regs) +#endif { - unsigned long v, v1; + u32 v, v1; +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@ -1520,7 +1538,7 @@ void smp_error_interrupt(struct pt_regs *regs) 6: Received illegal vector 7: Illegal register address */ - printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); irq_exit(); } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index ebe417b4d7f..c728885e4f4 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1528,10 +1528,17 @@ int __init APIC_init_uniprocessor(void) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ +#ifdef CONFIG_X86_64 asmlinkage void smp_spurious_interrupt(void) +#else +void smp_spurious_interrupt(struct pt_regs *regs) +#endif { - unsigned int v; + u32 v; + +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1542,18 +1549,31 @@ asmlinkage void smp_spurious_interrupt(void) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); +#ifdef CONFIG_X86_64 add_pda(irq_spurious_count, 1); +#else + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); + __get_cpu_var(irq_stat).irq_spurious_count++; +#endif irq_exit(); } /* * This interrupt should never happen with our APIC/SMP architecture */ +#ifdef CONFIG_X86_64 asmlinkage void smp_error_interrupt(void) +#else +void smp_error_interrupt(struct pt_regs *regs) +#endif { - unsigned int v, v1; + u32 v, v1; +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); -- cgit v1.2.3-70-g09d2 From 2f04fa888d270951b9e0fe9e641ddd560d77ad1b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:54 -0700 Subject: x86: apic copy calibrate_APIC_clock to each other in apic_32/64.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 86 +++++++++++++++++++ arch/x86/kernel/apic_64.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 301 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 60a901b2d4f..05498c37f8d 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -424,6 +424,90 @@ static void __cpuinit setup_APIC_timer(void) clockevents_register_device(levt); } +#ifdef CONFIG_X86_64 +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static int __init calibrate_APIC_clock(void) +{ + unsigned apic, apic_start; + unsigned long tsc, tsc_start; + int result; + + local_irq_disable(); + + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + * + * No interrupt enable ! + */ + __setup_APIC_LVTT(250000000, 0, 0); + + apic_start = apic_read(APIC_TMCCT); +#ifdef CONFIG_X86_PM_TIMER + if (apic_calibrate_pmtmr && pmtmr_ioport) { + pmtimer_wait(5000); /* 5ms wait */ + apic = apic_read(APIC_TMCCT); + result = (apic_start - apic) * 1000L / 5; + } else +#endif + { + rdtscll(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscll(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && + (apic_start - apic) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * tsc_khz / + (tsc - tsc_start); + } + + local_irq_enable(); + + printk(KERN_DEBUG "APIC timer calibration result %d\n", result); + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, + lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (result * APIC_DIVISOR) / HZ; + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + return 0; +} + +#else /* * In this functions we calibrate APIC bus clocks to the external timer. * @@ -635,6 +719,8 @@ static int __init calibrate_APIC_clock(void) return 0; } +#endif + /* * Setup the boot APIC * diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index c728885e4f4..6e99af5ce67 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -478,6 +478,7 @@ static void __cpuinit setup_APIC_timer(void) clockevents_register_device(levt); } +#ifdef CONFIG_X86_64 /* * In this function we calibrate APIC bus clocks to the external * timer. Unfortunately we cannot use jiffies and the timer irq @@ -560,6 +561,220 @@ static int __init calibrate_APIC_clock(void) return 0; } +#else +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ + +#define LAPIC_CAL_LOOPS (HZ/10) + +static __initdata int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) +{ + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); + + if (cpu_has_tsc) + rdtscll(tsc); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } +} + +static int __init calibrate_APIC_clock(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltapm; + int pm_referenced = 0; + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + /* + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(1000000000, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + pm_referenced = 1; + } + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, + lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + local_irq_enable(); + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* We trust the pm timer based calibration */ + if (!pm_referenced) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + local_irq_enable(); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + else + levt->features |= CLOCK_EVT_FEAT_DUMMY; + } else + local_irq_enable(); + + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + return -1; + } + + return 0; +} + +#endif + /* * Setup the boot APIC * -- cgit v1.2.3-70-g09d2 From 6c15822752a13748241e1a34068f2b15b660d28e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:55 -0700 Subject: x86: apic copy apic_64.c to apic_32.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 05498c37f8d..0b11d6e3f09 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -90,6 +90,24 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif +#ifdef CONFIG_X86_64 +#define HAVE_X2APIC +#endif + +#ifdef HAVE_X2APIC +int x2apic; +/* x2apic enabled before OS handover */ +int x2apic_preenabled; +int disable_x2apic; +static __init int setup_nox2apic(char *str) +{ + disable_x2apic = 1; + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + return 0; +} +early_param("nox2apic", setup_nox2apic); +#endif + unsigned long mp_lapic_addr; int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ @@ -231,6 +249,42 @@ static struct apic_ops xapic_ops = { struct apic_ops __read_mostly *apic_ops = &xapic_ops; EXPORT_SYMBOL_GPL(apic_ops); +#ifdef HAVE_X2APIC +static void x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return; +} + +static u32 safe_x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return 0; +} + +void x2apic_icr_write(u32 low, u32 id) +{ + wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); +} + +u64 x2apic_icr_read(void) +{ + unsigned long val; + + rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + return val; +} + +static struct apic_ops x2apic_ops = { + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .icr_read = x2apic_icr_read, + .icr_write = x2apic_icr_write, + .wait_icr_idle = x2apic_wait_icr_idle, + .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, +}; +#endif + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ @@ -1308,6 +1362,127 @@ void __cpuinit end_local_APIC_setup(void) apic_pm_activate(); } +#ifdef HAVE_X2APIC +void check_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + + if (msr & X2APIC_ENABLE) { + printk("x2apic enabled by BIOS, switching to x2apic ops\n"); + x2apic_preenabled = x2apic = 1; + apic_ops = &x2apic_ops; + } +} + +void enable_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (!(msr & X2APIC_ENABLE)) { + printk("Enabling x2apic\n"); + wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); + } +} + +void enable_IR_x2apic(void) +{ +#ifdef CONFIG_INTR_REMAP + int ret; + unsigned long flags; + + if (!cpu_has_x2apic) + return; + + if (!x2apic_preenabled && disable_x2apic) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of nox2apic\n"); + return; + } + + if (x2apic_preenabled && disable_x2apic) + panic("Bios already enabled x2apic, can't enforce nox2apic"); + + if (!x2apic_preenabled && skip_ioapic_setup) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of skipping io-apic setup\n"); + return; + } + + ret = dmar_table_init(); + if (ret) { + printk(KERN_INFO + "dmar_table_init() failed with %d:\n", ret); + + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else + printk(KERN_INFO + "Not enabling x2apic,Intr-remapping\n"); + return; + } + + local_irq_save(flags); + mask_8259A(); + save_mask_IO_APIC_setup(); + + ret = enable_intr_remapping(1); + + if (ret && x2apic_preenabled) { + local_irq_restore(flags); + panic("x2apic enabled by bios. But IR enabling failed"); + } + + if (ret) + goto end; + + if (!x2apic) { + x2apic = 1; + apic_ops = &x2apic_ops; + enable_x2apic(); + } +end: + if (ret) + /* + * IR enabling failed + */ + restore_IO_APIC_setup(); + else + reinit_intr_remapped_IO_APIC(x2apic_preenabled); + + unmask_8259A(); + local_irq_restore(flags); + + if (!ret) { + if (!x2apic_preenabled) + printk(KERN_INFO + "Enabled x2apic and interrupt-remapping\n"); + else + printk(KERN_INFO + "Enabled Interrupt-remapping\n"); + } else + printk(KERN_ERR + "Failed to enable Interrupt-remapping and x2apic\n"); +#else + if (!cpu_has_x2apic) + return; + + if (x2apic_preenabled) + panic("x2apic enabled prior OS handover," + " enable CONFIG_INTR_REMAP"); + + printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " + " and x2apic\n"); +#endif + + return; +} +#endif /* HAVE_X2APIC */ + #ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. @@ -1438,6 +1613,13 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { +#ifdef HAVE_X2APIC + if (x2apic) { + boot_cpu_physical_apicid = read_apic_id(); + return; + } +#endif + /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another @@ -1501,6 +1683,7 @@ int __init APIC_init_uniprocessor(void) #ifdef CONFIG_X86_64 setup_apic_routing(); #endif + verify_local_APIC(); connect_bsp_APIC(); @@ -1879,6 +2062,11 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); +#ifdef HAVE_X2APIC + if (x2apic) + enable_x2apic(); + else +#endif { /* * Make sure the APICBASE points to the right address -- cgit v1.2.3-70-g09d2 From 0f611ffaea81b8e1c69682188ba1ccaf7683a2ba Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:56 -0700 Subject: x86: rename apic_32.c and apic_64.c to apic.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/apic.c | 2311 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/apic_32.c | 2312 --------------------------------------------- arch/x86/kernel/apic_64.c | 2311 -------------------------------------------- 4 files changed, 2312 insertions(+), 4624 deletions(-) create mode 100644 arch/x86/kernel/apic.c delete mode 100644 arch/x86/kernel/apic_32.c delete mode 100644 arch/x86/kernel/apic_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 7264f9c3af8..45cecc59d89 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -60,7 +60,7 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c new file mode 100644 index 00000000000..6e99af5ce67 --- /dev/null +++ b/arch/x86/kernel/apic.c @@ -0,0 +1,2311 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Sanity check + */ +#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) +# error SPURIOUS_APIC_VECTOR definition error +#endif + +#ifdef CONFIG_X86_32 +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + force_enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +#endif + +#ifdef CONFIG_X86_64 +static int apic_calibrate_pmtmr __initdata; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + +#ifdef CONFIG_X86_64 +#define HAVE_X2APIC +#endif + +#ifdef HAVE_X2APIC +int x2apic; +/* x2apic enabled before OS handover */ +int x2apic_preenabled; +int disable_x2apic; +static __init int setup_nox2apic(char *str) +{ + disable_x2apic = 1; + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + return 0; +} +early_param("nox2apic", setup_nox2apic); +#endif + +unsigned long mp_lapic_addr; +int disable_apic; +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +static int disable_apic_timer __cpuinitdata; +/* Local APIC timer works in C2 */ +int local_apic_timer_c2_ok; +EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); + +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + +/* + * Debug level, exported for io_apic.c + */ +unsigned int apic_verbosity; + +int pic_mode; + +/* Have we found an MP table */ +int smp_found_config; + +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +static unsigned int calibration_result; + +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt); +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt); +static void lapic_timer_broadcast(cpumask_t mask); +static void apic_pm_activate(void); + +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + +static unsigned long apic_phys; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a separate chip + */ +static inline int lapic_is_integrated(void) +{ +#ifdef CONFIG_X86_64 + return 1; +#else + return APIC_INTEGRATED(lapic_get_version()); +#endif +} + +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) +{ + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + return lapic_get_version() >= 0x14; +} + +/* + * Paravirt kernels also might be using these below ops. So we still + * use generic apic_read()/apic_write(), which might be pointing to different + * ops in PARAVIRT case. + */ +void xapic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +u32 safe_xapic_wait_icr_idle(void) +{ + u32 send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + +void xapic_icr_write(u32 low, u32 id) +{ + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR, low); +} + +u64 xapic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return icr1 | ((u64)icr2 << 32); +} + +static struct apic_ops xapic_ops = { + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .icr_read = xapic_icr_read, + .icr_write = xapic_icr_write, + .wait_icr_idle = xapic_wait_icr_idle, + .safe_wait_icr_idle = safe_xapic_wait_icr_idle, +}; + +struct apic_ops __read_mostly *apic_ops = &xapic_ops; +EXPORT_SYMBOL_GPL(apic_ops); + +#ifdef HAVE_X2APIC +static void x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return; +} + +static u32 safe_x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return 0; +} + +void x2apic_icr_write(u32 low, u32 id) +{ + wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); +} + +u64 x2apic_icr_read(void) +{ + unsigned long val; + + rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + return val; +} + +static struct apic_ops x2apic_ops = { + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .icr_read = x2apic_icr_read, + .icr_write = x2apic_icr_write, + .wait_icr_idle = x2apic_wait_icr_idle, + .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, +}; +#endif + +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ +void __cpuinit enable_NMI_through_LVT0(void) +{ + unsigned int v; + + /* unmask and set to NMI */ + v = APIC_DM_NMI; + + /* Level triggered for 82489DX (32bit mode) */ + if (!lapic_is_integrated()) + v |= APIC_LVT_LEVEL_TRIGGER; + + apic_write(APIC_LVT0, v); +} + +#ifdef CONFIG_X86_32 +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} +#endif + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v; + + v = apic_read(APIC_LVR); + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; +} + +/* + * Local APIC timer + */ + +/* Clock divisor */ +#ifdef CONFG_X86_64 +#define APIC_DIVISOR 1 +#else +#define APIC_DIVISOR 16 +#endif + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); + + if (!oneshot) + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); +} + +/* + * Setup extended LVT, AMD specific (K8, family 10h) + * + * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and + * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. + */ + +#define APIC_EILVT_LVTOFF_MCE 0 +#define APIC_EILVT_LVTOFF_IBS 1 + +static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + + apic_write(reg, v); +} + +u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_MCE; +} + +u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; +} +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); + +/* + * Program the next event, relative to now + */ +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write(APIC_TMICT, delta); + return 0; +} + +/* + * Setup the lapic timer in periodic or oneshot mode + */ +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + unsigned int v; + + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return; + + local_irq_save(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + __setup_APIC_LVTT(calibration_result, + mode != CLOCK_EVT_MODE_PERIODIC, 1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; + } + + local_irq_restore(flags); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(cpumask_t mask) +{ +#ifdef CONFIG_SMP + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#endif +} + +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. + */ +static void __cpuinit setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); +} + +#ifdef CONFIG_X86_64 +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static int __init calibrate_APIC_clock(void) +{ + unsigned apic, apic_start; + unsigned long tsc, tsc_start; + int result; + + local_irq_disable(); + + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + * + * No interrupt enable ! + */ + __setup_APIC_LVTT(250000000, 0, 0); + + apic_start = apic_read(APIC_TMCCT); +#ifdef CONFIG_X86_PM_TIMER + if (apic_calibrate_pmtmr && pmtmr_ioport) { + pmtimer_wait(5000); /* 5ms wait */ + apic = apic_read(APIC_TMCCT); + result = (apic_start - apic) * 1000L / 5; + } else +#endif + { + rdtscll(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscll(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && + (apic_start - apic) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * tsc_khz / + (tsc - tsc_start); + } + + local_irq_enable(); + + printk(KERN_DEBUG "APIC timer calibration result %d\n", result); + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, + lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (result * APIC_DIVISOR) / HZ; + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + return 0; +} + +#else +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ + +#define LAPIC_CAL_LOOPS (HZ/10) + +static __initdata int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) +{ + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); + + if (cpu_has_tsc) + rdtscll(tsc); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } +} + +static int __init calibrate_APIC_clock(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltapm; + int pm_referenced = 0; + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + /* + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(1000000000, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + pm_referenced = 1; + } + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, + lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + local_irq_enable(); + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* We trust the pm timer based calibration */ + if (!pm_referenced) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + local_irq_enable(); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + else + levt->features |= CLOCK_EVT_FEAT_DUMMY; + } else + local_irq_enable(); + + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + return -1; + } + + return 0; +} + +#endif + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=%d!\n", nmi_watchdog); + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); +} + +void __cpuinit setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(); +} + +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + /* + * the NMI deadlock-detector uses this. + */ +#ifdef CONFIG_X86_64 + add_pda(apic_timer_irqs, 1); +#else + per_cpu(irq_stat, cpu).apic_timer_irqs++; +#endif + + evt->event_handler(evt); +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +void smp_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + local_apic_timer_interrupt(); + irq_exit(); + + set_irq_regs(old_regs); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ +void clear_local_APIC(void) +{ + int maxlvt; + u32 v; + + /* APIC hasn't been mapped yet */ + if (!apic_phys) + return; + + maxlvt = lapic_get_maxlvt(); + /* + * Masking an LVT entry can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. + */ + v = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); + } + + /* lets not touch this if we didn't frob it */ +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif + /* + * Clean APIC state for other OSs: + */ + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write(APIC_LVTPC, APIC_LVT_MASKED); + + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +/** + * disable_local_APIC - clear and disable the local APIC + */ +void disable_local_APIC(void) +{ + unsigned int value; + + clear_local_APIC(); + + /* + * Disable APIC (implies clearing of registers + * for 82489DX!). + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); + +#ifdef CONFIG_X86_32 + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ + if (enabled_via_apicbase) { + unsigned int l, h; + + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); + } +#endif +} + +/* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ +void lapic_shutdown(void) +{ + unsigned long flags; + + if (!cpu_has_apic) + return; + + local_irq_save(flags); + +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) + clear_local_APIC(); + else +#endif + disable_local_APIC(); + + + local_irq_restore(flags); +} + +/* + * This is to verify that we're looking at a real local APIC. + * Check these against your board if the CPUs aren't getting + * started for no apparent reason. + */ +int __init verify_local_APIC(void) +{ + unsigned int reg0, reg1; + + /* + * The version register is read-only in a real APIC. + */ + reg0 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); + apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); + reg1 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); + + /* + * The two version reads above should print the same + * numbers. If the second one is different, then we + * poke at a non-APIC. + */ + if (reg1 != reg0) + return 0; + + /* + * Check if the version looks reasonably. + */ + reg1 = GET_APIC_VERSION(reg0); + if (reg1 == 0x00 || reg1 == 0xff) + return 0; + reg1 = lapic_get_maxlvt(); + if (reg1 < 0x02 || reg1 == 0xff) + return 0; + + /* + * The ID register is read/write in a real APIC. + */ + reg0 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); + reg1 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); + apic_write(APIC_ID, reg0); + if (reg1 != (reg0 ^ APIC_ID_MASK)) + return 0; + + /* + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. + */ + reg0 = apic_read(APIC_LVT0); + apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); + reg1 = apic_read(APIC_LVT1); + apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); + + return 1; +} + +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ +void __init sync_Arb_IDs(void) +{ + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ + if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); +} + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !cpu_has_apic) + return; + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else +#endif + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT1, value); +} + +static void __cpuinit lapic_setup_esr(void) +{ + unsigned long oldvalue, value, maxlvt; + if (lapic_is_integrated() && !esr_disable) { + if (esr_disable) { + /* + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; + } + /* !82489DX */ + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08lx after: 0x%08lx\n", + oldvalue, value); + } else { + printk(KERN_INFO "No ESR for 82489DX.\n"); + } +} + + +/** + * setup_local_APIC - setup the local APIC + */ +void __cpuinit setup_local_APIC(void) +{ + unsigned int value; + int i, j; + +#ifdef CONFIG_X86_32 + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (esr_disable) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } +#endif + + preempt_disable(); + + /* + * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. + */ + if (!apic_id_registered()) + BUG(); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + init_apic_ldr(); + + /* + * Set Task Priority to 'accept all'. We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write(APIC_TASKPRI, value); + + /* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now CPU has serviced that pending interrupt and + * it might not have done the ack_APIC_irq() because it thought, + * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it + * does not clear the ISR bit and cpu thinks it has already serivced + * the interrupt. Hence a vector might get locked. It was noticed + * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + */ + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1< 1) || + (boot_cpu_data.x86 == 15)) + break; + goto no_apic; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || + (boot_cpu_data.x86 == 5 && cpu_has_apic)) + break; + goto no_apic; + default: + goto no_apic; + } + + if (!cpu_has_apic) { + /* + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. + */ + if (!force_enable_local_apic) { + printk(KERN_INFO "Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); + return -1; + } + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + printk(KERN_INFO + "Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + } + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + printk(KERN_WARNING "Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + printk(KERN_INFO "Found and enabled local APIC!\n"); + + apic_pm_activate(); + + return 0; + +no_apic: + printk(KERN_INFO "No local APIC present or hardware disabled\n"); + return -1; +} +#endif + +#ifdef CONFIG_X86_64 +void __init early_init_lapic_mapping(void) +{ + unsigned long phys_addr; + + /* + * If no local APIC can be found then go out + * : it means there is no mpatable and MADT + */ + if (!smp_found_config) + return; + + phys_addr = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, phys_addr); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_physical_apicid = read_apic_id(); +} +#endif + +/** + * init_apic_mappings - initialize APIC mappings + */ +void __init init_apic_mappings(void) +{ +#ifdef HAVE_X2APIC + if (x2apic) { + boot_cpu_physical_apicid = read_apic_id(); + return; + } +#endif + + /* + * If no local APIC can be found then set up a fake all + * zeroes page to simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, apic_phys); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int apic_version[MAX_APICS]; + +int __init APIC_init_uniprocessor(void) +{ +#ifdef CONFIG_X86_64 + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } +#else + if (!smp_found_config && !cpu_has_apic) + return -1; + + /* + * Complain if the BIOS pretends there is one. + */ + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + return -1; + } +#endif + +#ifdef HAVE_X2APIC + enable_IR_x2apic(); +#endif +#ifdef CONFIG_X86_64 + setup_apic_routing(); +#endif + + verify_local_APIC(); + connect_bsp_APIC(); + +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); +#else + /* + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. + */ +# ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = read_apic_id(); +# endif +#endif + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); + setup_local_APIC(); + +#ifdef CONFIG_X86_64 + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling vector on BP + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); +#endif + +#ifdef CONFIG_X86_IO_APIC + if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) +#endif + localise_nmi_watchdog(); + end_local_APIC_setup(); + +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +# ifdef CONFIG_X86_64 + else + nr_ioapics = 0; +# endif +#endif + +#ifdef CONFIG_X86_64 + setup_boot_APIC_clock(); + check_nmi_watchdog(); +#else + setup_boot_clock(); +#endif + + return 0; +} + +/* + * Local APIC interrupts + */ + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +#ifdef CONFIG_X86_64 +asmlinkage void smp_spurious_interrupt(void) +#else +void smp_spurious_interrupt(struct pt_regs *regs) +#endif +{ + u32 v; + +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + +#ifdef CONFIG_X86_64 + add_pda(irq_spurious_count, 1); +#else + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); + __get_cpu_var(irq_stat).irq_spurious_count++; +#endif + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ +#ifdef CONFIG_X86_64 +asmlinkage void smp_error_interrupt(void) +#else +void smp_error_interrupt(struct pt_regs *regs) +#endif +{ + u32 v, v1; + +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +void __init connect_bsp_APIC(void) +{ +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); + } +#endif + enable_apic_mode(); +} + +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + unsigned int value; + +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + return; + } +#endif + + /* Go back to Virtual Wire compatibility mode */ + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); + } + + /* + * For LVT1 make it edge triggered, active high, + * nmi and enabled + */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} + +void __cpuinit generic_processor_info(int apicid, int version) +{ + int cpu; + cpumask_t tmp_map; + + /* + * Validate version + */ + if (version == 0x0) { + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + version); + version = 0x10; + } + apic_version[apicid] = version; + + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + num_processors++; + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + + physid_set(apicid, phys_cpu_present_map); + if (apicid == boot_cpu_physical_apicid) { + /* + * x86_bios_cpu_apicid is required to have processors listed + * in same order as logical cpu numbers. Hence the first + * entry is BSP, and so on. + */ + cpu = 0; + } + if (apicid > max_physical_apicid) + max_physical_apicid = apicid; + +#ifdef CONFIG_X86_32 + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion. + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj + */ + if (max_physical_apicid >= 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } +#endif + +#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) + /* are we being called early in kernel startup? */ + if (early_per_cpu_ptr(x86_cpu_to_apicid)) { + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); + + cpu_to_apicid[cpu] = apicid; + bios_cpu_apicid[cpu] = apicid; + } else { + per_cpu(x86_cpu_to_apicid, cpu) = apicid; + per_cpu(x86_bios_cpu_apicid, cpu) = apicid; + } +#endif + + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); +} + +#ifdef CONFIG_X86_64 +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} +#endif + +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif + + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); + return 0; +} + +static int lapic_resume(struct sys_device *dev) +{ + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + local_irq_save(flags); + +#ifdef HAVE_X2APIC + if (x2apic) + enable_x2apic(); + else +#endif + { + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + + local_irq_restore(flags); + + return 0; +} + +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct sysdev_class lapic_sysclass = { + .name = "lapic", + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __cpuinit apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + int error; + + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ + +#ifdef CONFIG_X86_64 +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. Use available data to take a good guess. + * If in doubt, go HPET. + */ +__cpuinit int apic_is_clustered_box(void) +{ + int i, clusters, zeros; + unsigned id; + u16 *bios_cpu_apicid; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + /* + * there is not this kind of box with AMD CPU yet. + * Some AMD box with quadcore cpu and 8 sockets apicid + * will be [4, 0x23] or [8, 0x27] could be thought to + * vsmp box still need checking... + */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) + return 0; + + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < NR_CPUS; i++) { + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } + else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } + else + break; + + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. + */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards + */ + if (is_vsmp_box() && clusters > 1) + return 1; + + /* + * If clusters > 2, then should be multi-chassis. + * May have to revisit this when multi-core + hyperthreaded CPUs come + * out, but AFAIK this will work even for them. + */ + return (clusters > 2); +} +#endif + +/* + * APIC command line parameters + */ +static int __init setup_disableapic(char *arg) +{ + disable_apic = 1; + setup_clear_cpu_cap(X86_FEATURE_APIC); + return 0; +} +early_param("disableapic", setup_disableapic); + +/* same as disableapic, for compatibility */ +static int __init setup_nolapic(char *arg) +{ + return setup_disableapic(arg); +} +early_param("nolapic", setup_nolapic); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static int __init parse_disable_apic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("nolapic_timer", parse_nolapic_timer); + +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + return 0; +#endif + return -EINVAL; + } + + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic", apic_set_verbosity); + +static int __init lapic_insert_resource(void) +{ + if (!apic_phys) + return -1; + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + return 0; +} + +/* + * need call insert after e820_reserve_resources() + * that is using request_resource + */ +late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c deleted file mode 100644 index 0b11d6e3f09..00000000000 --- a/arch/x86/kernel/apic_32.c +++ /dev/null @@ -1,2312 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * Sanity check - */ -#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) -# error SPURIOUS_APIC_VECTOR definition error -#endif - -#ifdef CONFIG_X86_32 -/* - * Knob to control our willingness to enable the local APIC. - * - * +1=force-enable - */ -static int force_enable_local_apic; -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); -/* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; - -#endif - -#ifdef CONFIG_X86_64 -static int apic_calibrate_pmtmr __initdata; -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); -#endif - -#ifdef CONFIG_X86_64 -#define HAVE_X2APIC -#endif - -#ifdef HAVE_X2APIC -int x2apic; -/* x2apic enabled before OS handover */ -int x2apic_preenabled; -int disable_x2apic; -static __init int setup_nox2apic(char *str) -{ - disable_x2apic = 1; - setup_clear_cpu_cap(X86_FEATURE_X2APIC); - return 0; -} -early_param("nox2apic", setup_nox2apic); -#endif - -unsigned long mp_lapic_addr; -int disable_apic; -/* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; -/* Local APIC timer works in C2 */ -int local_apic_timer_c2_ok; -EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); - -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - -/* - * Debug level, exported for io_apic.c - */ -unsigned int apic_verbosity; - -int pic_mode; - -/* Have we found an MP table */ -int smp_found_config; - -static struct resource lapic_resource = { - .name = "Local APIC", - .flags = IORESOURCE_MEM | IORESOURCE_BUSY, -}; - -static unsigned int calibration_result; - -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(cpumask_t mask); -static void apic_pm_activate(void); - -/* - * The local apic timer can be used for any function which is CPU local. - */ -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - -static unsigned long apic_phys; - -/* - * Get the LAPIC version - */ -static inline int lapic_get_version(void) -{ - return GET_APIC_VERSION(apic_read(APIC_LVR)); -} - -/* - * Check, if the APIC is integrated or a separate chip - */ -static inline int lapic_is_integrated(void) -{ -#ifdef CONFIG_X86_64 - return 1; -#else - return APIC_INTEGRATED(lapic_get_version()); -#endif -} - -/* - * Check, whether this is a modern or a first generation APIC - */ -static int modern_apic(void) -{ - /* AMD systems use old APIC versions, so check the CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) - return 1; - return lapic_get_version() >= 0x14; -} - -/* - * Paravirt kernels also might be using these below ops. So we still - * use generic apic_read()/apic_write(), which might be pointing to different - * ops in PARAVIRT case. - */ -void xapic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -u32 safe_xapic_wait_icr_idle(void) -{ - u32 send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); - - return send_status; -} - -void xapic_icr_write(u32 low, u32 id) -{ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); - apic_write(APIC_ICR, low); -} - -u64 xapic_icr_read(void) -{ - u32 icr1, icr2; - - icr2 = apic_read(APIC_ICR2); - icr1 = apic_read(APIC_ICR); - - return icr1 | ((u64)icr2 << 32); -} - -static struct apic_ops xapic_ops = { - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .icr_read = xapic_icr_read, - .icr_write = xapic_icr_write, - .wait_icr_idle = xapic_wait_icr_idle, - .safe_wait_icr_idle = safe_xapic_wait_icr_idle, -}; - -struct apic_ops __read_mostly *apic_ops = &xapic_ops; -EXPORT_SYMBOL_GPL(apic_ops); - -#ifdef HAVE_X2APIC -static void x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return; -} - -static u32 safe_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return 0; -} - -void x2apic_icr_write(u32 low, u32 id) -{ - wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); -} - -u64 x2apic_icr_read(void) -{ - unsigned long val; - - rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); - return val; -} - -static struct apic_ops x2apic_ops = { - .read = native_apic_msr_read, - .write = native_apic_msr_write, - .icr_read = x2apic_icr_read, - .icr_write = x2apic_icr_write, - .wait_icr_idle = x2apic_wait_icr_idle, - .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, -}; -#endif - -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - - /* Level triggered for 82489DX (32bit mode) */ - if (!lapic_is_integrated()) - v |= APIC_LVT_LEVEL_TRIGGER; - - apic_write(APIC_LVT0, v); -} - -#ifdef CONFIG_X86_32 -/** - * get_physical_broadcast - Get number of physical broadcast IDs - */ -int get_physical_broadcast(void) -{ - return modern_apic() ? 0xff : 0xf; -} -#endif - -/** - * lapic_get_maxlvt - get the maximum number of local vector table entries - */ -int lapic_get_maxlvt(void) -{ - unsigned int v; - - v = apic_read(APIC_LVR); - /* - * - we always have APIC integrated on 64bit mode - * - 82489DXs do not report # of LVT entries - */ - return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; -} - -/* - * Local APIC timer - */ - -/* Clock divisor */ -#ifdef CONFG_X86_64 -#define APIC_DIVISOR 1 -#else -#define APIC_DIVISOR 16 -#endif - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) -{ - unsigned int lvtt_value, tmp_value; - - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - - if (!irqen) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, - (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | - APIC_TDR_DIV_16); - - if (!oneshot) - apic_write(APIC_TMICT, clocks / APIC_DIVISOR); -} - -/* - * Setup extended LVT, AMD specific (K8, family 10h) - * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. - * - * If mask=1, the LVT entry does not generate interrupts while mask=0 - * enables the vector. See also the BKDGs. - */ - -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) -{ - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - - apic_write(reg, v); -} - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} -EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); - -/* - * Program the next event, relative to now - */ -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - apic_write(APIC_TMICT, delta); - return 0; -} - -/* - * Setup the lapic timer in periodic or oneshot mode - */ -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - unsigned long flags; - unsigned int v; - - /* Lapic used as dummy for broadcast ? */ - if (evt->features & CLOCK_EVT_FEAT_DUMMY) - return; - - local_irq_save(flags); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(calibration_result, - mode != CLOCK_EVT_MODE_PERIODIC, 1); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - v = apic_read(APIC_LVTT); - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - break; - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } - - local_irq_restore(flags); -} - -/* - * Local APIC timer broadcast function - */ -static void lapic_timer_broadcast(cpumask_t mask) -{ -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#endif -} - -/* - * Setup the local APIC timer for this CPU. Copy the initilized values - * of the boot CPU and register the clock event in the framework. - */ -static void __cpuinit setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); - - clockevents_register_device(levt); -} - -#ifdef CONFIG_X86_64 -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (result * APIC_DIVISOR) / HZ; - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - return 0; -} - -#else -/* - * In this functions we calibrate APIC bus clocks to the external timer. - * - * We want to do the calibration only once since we want to have local timer - * irqs syncron. CPUs connected by the same APIC bus have the very same bus - * frequency. - * - * This was previously done by reading the PIT/HPET and waiting for a wrap - * around to find out, that a tick has elapsed. I have a box, where the PIT - * readout is broken, so it never gets out of the wait loop again. This was - * also reported by others. - * - * Monitoring the jiffies value is inaccurate and the clockevents - * infrastructure allows us to do a simple substitution of the interrupt - * handler. - * - * The calibration routine also uses the pm_timer when possible, as the PIT - * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes - * back to normal later in the boot process). - */ - -#define LAPIC_CAL_LOOPS (HZ/10) - -static __initdata int lapic_cal_loops = -1; -static __initdata long lapic_cal_t1, lapic_cal_t2; -static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; -static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; -static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; - -/* - * Temporary interrupt handler. - */ -static void __init lapic_cal_handler(struct clock_event_device *dev) -{ - unsigned long long tsc = 0; - long tapic = apic_read(APIC_TMCCT); - unsigned long pm = acpi_pm_read_early(); - - if (cpu_has_tsc) - rdtscll(tsc); - - switch (lapic_cal_loops++) { - case 0: - lapic_cal_t1 = tapic; - lapic_cal_tsc1 = tsc; - lapic_cal_pm1 = pm; - lapic_cal_j1 = jiffies; - break; - - case LAPIC_CAL_LOOPS: - lapic_cal_t2 = tapic; - lapic_cal_tsc2 = tsc; - if (pm < lapic_cal_pm1) - pm += ACPI_PM_OVRRUN; - lapic_cal_pm2 = pm; - lapic_cal_j2 = jiffies; - break; - } -} - -static int __init calibrate_APIC_clock(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - const long pm_100ms = PMTMR_TICKS_PER_SEC/10; - const long pm_thresh = pm_100ms/100; - void (*real_handler)(struct clock_event_device *dev); - unsigned long deltaj; - long delta, deltapm; - int pm_referenced = 0; - - local_irq_disable(); - - /* Replace the global interrupt handler */ - real_handler = global_clock_event->event_handler; - global_clock_event->event_handler = lapic_cal_handler; - - /* - * Setup the APIC counter to 1e9. There is no way the lapic - * can underflow in the 100ms detection time frame - */ - __setup_APIC_LVTT(1000000000, 0, 0); - - /* Let the interrupts run */ - local_irq_enable(); - - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); - - local_irq_disable(); - - /* Restore the real event handler */ - global_clock_event->event_handler = real_handler; - - /* Build delta t1-t2 as apic timer counts down */ - delta = lapic_cal_t1 - lapic_cal_t2; - apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); - - /* Check, if the PM timer is available */ - deltapm = lapic_cal_pm2 - lapic_cal_pm1; - apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); - - if (deltapm) { - unsigned long mult; - u64 res; - - mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); - - if (deltapm > (pm_100ms - pm_thresh) && - deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); - } else { - res = (((u64) deltapm) * mult) >> 22; - do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " - "with PM Timer: %ldms instead of 100ms\n", - (long)res); - /* Correct the lapic counter value */ - res = (((u64) delta) * pm_100ms); - do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " - "%lu (%ld)\n", (unsigned long) res, delta); - delta = (long) res; - } - pm_referenced = 1; - } - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; - - apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); - apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - calibration_result); - - if (cpu_has_tsc) { - delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), - (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); - } - - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%u.%04u MHz.\n", - calibration_result / (1000000 / HZ), - calibration_result % (1000000 / HZ)); - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - local_irq_enable(); - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - levt->features &= ~CLOCK_EVT_FEAT_DUMMY; - - /* We trust the pm timer based calibration */ - if (!pm_referenced) { - apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); - - /* - * Setup the apic timer manually - */ - levt->event_handler = lapic_cal_handler; - lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); - lapic_cal_loops = -1; - - /* Let the interrupts run */ - local_irq_enable(); - - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); - - local_irq_disable(); - - /* Stop the lapic timer */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); - - local_irq_enable(); - - /* Jiffies delta */ - deltaj = lapic_cal_j2 - lapic_cal_j1; - apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); - - /* Check, if the jiffies result is consistent */ - if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) - apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); - else - levt->features |= CLOCK_EVT_FEAT_DUMMY; - } else - local_irq_enable(); - - if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - printk(KERN_WARNING - "APIC timer disabled due to verification failure.\n"); - return -1; - } - - return 0; -} - -#endif - -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) -{ - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - - if (calibrate_APIC_clock()) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; - } - - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); - - /* Setup the lapic or request the broadcast */ - setup_APIC_timer(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(); -} - -/* - * The guts of the apic timer interrupt - */ -static void local_apic_timer_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * Normally we should not be here till LAPIC has been initialized but - * in some cases like kdump, its possible that there is a pending LAPIC - * timer interrupt from previous kernel's context and is delivered in - * new kernel the moment interrupts are enabled. - * - * Interrupts are enabled early and LAPIC is setup much later, hence - * its possible that when we get here evt->event_handler is NULL. - * Check for event_handler being NULL and discard the interrupt as - * spurious. - */ - if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); - /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); - return; - } - - /* - * the NMI deadlock-detector uses this. - */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif - - evt->event_handler(evt); -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - local_apic_timer_interrupt(); - irq_exit(); - - set_irq_regs(old_regs); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - -/* - * Local APIC start and shutdown - */ - -/** - * clear_local_APIC - shutdown the local APIC - * - * This is called, when a CPU is disabled and before rebooting, so the state of - * the local APIC has no dangling leftovers. Also used to cleanout any BIOS - * leftovers during boot. - */ -void clear_local_APIC(void) -{ - int maxlvt; - u32 v; - - /* APIC hasn't been mapped yet */ - if (!apic_phys) - return; - - maxlvt = lapic_get_maxlvt(); - /* - * Masking an LVT entry can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. - */ - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); - } - - /* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) - if (maxlvt >= 5) { - v = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); - } -#endif - /* - * Clean APIC state for other OSs: - */ - apic_write(APIC_LVTT, APIC_LVT_MASKED); - apic_write(APIC_LVT0, APIC_LVT_MASKED); - apic_write(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write(APIC_LVTPC, APIC_LVT_MASKED); - - /* Integrated APIC (!82489DX) ? */ - if (lapic_is_integrated()) { - if (maxlvt > 3) - /* Clear ESR due to Pentium errata 3AP and 11AP */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } -} - -/** - * disable_local_APIC - clear and disable the local APIC - */ -void disable_local_APIC(void) -{ - unsigned int value; - - clear_local_APIC(); - - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write(APIC_SPIV, value); - -#ifdef CONFIG_X86_32 - /* - * When LAPIC was disabled by the BIOS and enabled by the kernel, - * restore the disabled state. - */ - if (enabled_via_apicbase) { - unsigned int l, h; - - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_ENABLE; - wrmsr(MSR_IA32_APICBASE, l, h); - } -#endif -} - -/* - * If Linux enabled the LAPIC against the BIOS default disable it down before - * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and - * not power-off. Additionally clear all LVT entries before disable_local_APIC - * for the case where Linux didn't enable the LAPIC. - */ -void lapic_shutdown(void) -{ - unsigned long flags; - - if (!cpu_has_apic) - return; - - local_irq_save(flags); - -#ifdef CONFIG_X86_32 - if (!enabled_via_apicbase) - clear_local_APIC(); - else -#endif - disable_local_APIC(); - - - local_irq_restore(flags); -} - -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = lapic_get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ APIC_ID_MASK)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - -/** - * sync_Arb_IDs - synchronize APIC bus arbitration IDs - */ -void __init sync_Arb_IDs(void) -{ - /* - * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not - * needed on AMD. - */ - if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | - APIC_INT_LEVELTRIG | APIC_DM_INIT); -} - -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned int value; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - -#ifdef CONFIG_X86_32 - /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - (boot_cpu_data.x86 == 15)) - value &= ~APIC_SPIV_FOCUS_DISABLED; - else -#endif - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - if (!lapic_is_integrated()) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write(APIC_LVT1, value); -} - -static void __cpuinit lapic_setup_esr(void) -{ - unsigned long oldvalue, value, maxlvt; - if (lapic_is_integrated() && !esr_disable) { - if (esr_disable) { - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - return; - } - /* !82489DX */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); - - /* enables sending errors */ - value = ERROR_APIC_VECTOR; - apic_write(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - printk(KERN_INFO "No ESR for 82489DX.\n"); - } -} - - -/** - * setup_local_APIC - setup the local APIC - */ -void __cpuinit setup_local_APIC(void) -{ - unsigned int value; - int i, j; - -#ifdef CONFIG_X86_32 - /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - } -#endif - - preempt_disable(); - - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - if (!apic_id_registered()) - BUG(); - - /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ - init_apic_ldr(); - - /* - * Set Task Priority to 'accept all'. We never change this - * later on. - */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write(APIC_TASKPRI, value); - - /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. - */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1< 1) || - (boot_cpu_data.x86 == 15)) - break; - goto no_apic; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && cpu_has_apic)) - break; - goto no_apic; - default: - goto no_apic; - } - - if (!cpu_has_apic) { - /* - * Over-ride BIOS and try to enable the local APIC only if - * "lapic" specified. - */ - if (!force_enable_local_apic) { - printk(KERN_INFO "Local APIC disabled by BIOS -- " - "you can enable it with \"lapic\"\n"); - return -1; - } - /* - * Some BIOSes disable the local APIC in the APIC_BASE - * MSR. This can only be done in software for Intel P6 or later - * and AMD K7 (Model > 1) or later. - */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk(KERN_INFO - "Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - printk(KERN_WARNING "Could not enable APIC!\n"); - return -1; - } - set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - printk(KERN_INFO "Found and enabled local APIC!\n"); - - apic_pm_activate(); - - return 0; - -no_apic: - printk(KERN_INFO "No local APIC present or hardware disabled\n"); - return -1; -} -#endif - -#ifdef CONFIG_X86_64 -void __init early_init_lapic_mapping(void) -{ - unsigned long phys_addr; - - /* - * If no local APIC can be found then go out - * : it means there is no mpatable and MADT - */ - if (!smp_found_config) - return; - - phys_addr = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, phys_addr); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, phys_addr); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_physical_apicid = read_apic_id(); -} -#endif - -/** - * init_apic_mappings - initialize APIC mappings - */ -void __init init_apic_mappings(void) -{ -#ifdef HAVE_X2APIC - if (x2apic) { - boot_cpu_physical_apicid = read_apic_id(); - return; - } -#endif - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int apic_version[MAX_APICS]; - -int __init APIC_init_uniprocessor(void) -{ -#ifdef CONFIG_X86_64 - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; - } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; - } -#else - if (!smp_found_config && !cpu_has_apic) - return -1; - - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - return -1; - } -#endif - -#ifdef HAVE_X2APIC - enable_IR_x2apic(); -#endif -#ifdef CONFIG_X86_64 - setup_apic_routing(); -#endif - - verify_local_APIC(); - connect_bsp_APIC(); - -#ifdef CONFIG_X86_64 - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); -#else - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -# ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = read_apic_id(); -# endif -#endif - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - setup_local_APIC(); - -#ifdef CONFIG_X86_64 - /* - * Now enable IO-APICs, actually call clear_IO_APIC - * We need clear_IO_APIC before enabling vector on BP - */ - if (!skip_ioapic_setup && nr_ioapics) - enable_IO_APIC(); -#endif - -#ifdef CONFIG_X86_IO_APIC - if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) -#endif - localise_nmi_watchdog(); - end_local_APIC_setup(); - -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -# ifdef CONFIG_X86_64 - else - nr_ioapics = 0; -# endif -#endif - -#ifdef CONFIG_X86_64 - setup_boot_APIC_clock(); - check_nmi_watchdog(); -#else - setup_boot_clock(); -#endif - - return 0; -} - -/* - * Local APIC interrupts - */ - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_spurious_interrupt(void) -#else -void smp_spurious_interrupt(struct pt_regs *regs) -#endif -{ - u32 v; - -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - -#ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); -#else - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " - "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; -#endif - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_error_interrupt(void) -#else -void smp_error_interrupt(struct pt_regs *regs) -#endif -{ - u32 v, v1; - -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -/** - * connect_bsp_APIC - attach the APIC to the interrupt system - */ -void __init connect_bsp_APIC(void) -{ -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's - * local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -#endif - enable_apic_mode(); -} - -/** - * disconnect_bsp_APIC - detach the APIC from the interrupt system - * @virt_wire_setup: indicates, whether virtual wire mode is selected - * - * Virtual wire mode is necessary to deliver legacy interrupts even when the - * APIC is disabled. - */ -void disconnect_bsp_APIC(int virt_wire_setup) -{ - unsigned int value; - -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect only on - * certain older boards). Note that APIC interrupts, including - * IPIs, won't work beyond this point! The only exception are - * INIT IPIs. - */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - return; - } -#endif - - /* Go back to Virtual Wire compatibility mode */ - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* - * For LVT1 make it edge triggered, active high, - * nmi and enabled - */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - -void __cpuinit generic_processor_info(int apicid, int version) -{ - int cpu; - cpumask_t tmp_map; - - /* - * Validate version - */ - if (version == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; - } - apic_version[apicid] = version; - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - - physid_set(apicid, phys_cpu_present_map); - if (apicid == boot_cpu_physical_apicid) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - */ - cpu = 0; - } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; - -#ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj - */ - if (max_physical_apicid >= 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } -#endif - -#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) - /* are we being called early in kernel startup? */ - if (early_per_cpu_ptr(x86_cpu_to_apicid)) { - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - - cpu_to_apicid[cpu] = apicid; - bios_cpu_apicid[cpu] = apicid; - } else { - per_cpu(x86_cpu_to_apicid, cpu) = apicid; - per_cpu(x86_bios_cpu_apicid, cpu) = apicid; - } -#endif - - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); -} - -#ifdef CONFIG_X86_64 -int hard_smp_processor_id(void) -{ - return read_apic_id(); -} -#endif - -/* - * Power management - */ -#ifdef CONFIG_PM - -static struct { - /* - * 'active' is true if the local APIC was enabled by us and - * not the BIOS; this signifies that we are also responsible - * for disabling it before entering apm/acpi suspend - */ - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - local_irq_save(flags); - -#ifdef HAVE_X2APIC - if (x2apic) - enable_x2apic(); - else -#endif - { - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - } - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - - local_irq_restore(flags); - - return 0; -} - -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. - */ - -static struct sysdev_class lapic_sysclass = { - .name = "lapic", - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __cpuinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -#ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) -{ - int i, clusters, zeros; - unsigned id; - u16 *bios_cpu_apicid; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < NR_CPUS; i++) { - /* are we being called early in kernel startup? */ - if (bios_cpu_apicid) { - id = bios_cpu_apicid[i]; - } - else if (i < nr_cpu_ids) { - if (cpu_present(i)) - id = per_cpu(x86_bios_cpu_apicid, i); - else - continue; - } - else - break; - - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. - * Since clusters are allocated sequentially, count zeros only if - * they are bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) - return 1; - - /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. - */ - return (clusters > 2); -} -#endif - -/* - * APIC command line parameters - */ -static int __init setup_disableapic(char *arg) -{ - disable_apic = 1; - setup_clear_cpu_cap(X86_FEATURE_APIC); - return 0; -} -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static int __init setup_nolapic(char *arg) -{ - return setup_disableapic(arg); -} -early_param("nolapic", setup_nolapic); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static int __init parse_disable_apic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("noapictimer", parse_disable_apic_timer); - -static int __init parse_nolapic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("nolapic_timer", parse_nolapic_timer); - -static int __init apic_set_verbosity(char *arg) -{ - if (!arg) { -#ifdef CONFIG_X86_64 - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; -#endif - return -EINVAL; - } - - if (strcmp("debug", arg) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", arg) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", arg); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -static int __init lapic_insert_resource(void) -{ - if (!apic_phys) - return -1; - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - return 0; -} - -/* - * need call insert after e820_reserve_resources() - * that is using request_resource - */ -late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c deleted file mode 100644 index 6e99af5ce67..00000000000 --- a/arch/x86/kernel/apic_64.c +++ /dev/null @@ -1,2311 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * Sanity check - */ -#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) -# error SPURIOUS_APIC_VECTOR definition error -#endif - -#ifdef CONFIG_X86_32 -/* - * Knob to control our willingness to enable the local APIC. - * - * +1=force-enable - */ -static int force_enable_local_apic; -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); -/* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; - -#endif - -#ifdef CONFIG_X86_64 -static int apic_calibrate_pmtmr __initdata; -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); -#endif - -#ifdef CONFIG_X86_64 -#define HAVE_X2APIC -#endif - -#ifdef HAVE_X2APIC -int x2apic; -/* x2apic enabled before OS handover */ -int x2apic_preenabled; -int disable_x2apic; -static __init int setup_nox2apic(char *str) -{ - disable_x2apic = 1; - setup_clear_cpu_cap(X86_FEATURE_X2APIC); - return 0; -} -early_param("nox2apic", setup_nox2apic); -#endif - -unsigned long mp_lapic_addr; -int disable_apic; -/* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; -/* Local APIC timer works in C2 */ -int local_apic_timer_c2_ok; -EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); - -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - -/* - * Debug level, exported for io_apic.c - */ -unsigned int apic_verbosity; - -int pic_mode; - -/* Have we found an MP table */ -int smp_found_config; - -static struct resource lapic_resource = { - .name = "Local APIC", - .flags = IORESOURCE_MEM | IORESOURCE_BUSY, -}; - -static unsigned int calibration_result; - -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(cpumask_t mask); -static void apic_pm_activate(void); - -/* - * The local apic timer can be used for any function which is CPU local. - */ -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - -static unsigned long apic_phys; - -/* - * Get the LAPIC version - */ -static inline int lapic_get_version(void) -{ - return GET_APIC_VERSION(apic_read(APIC_LVR)); -} - -/* - * Check, if the APIC is integrated or a separate chip - */ -static inline int lapic_is_integrated(void) -{ -#ifdef CONFIG_X86_64 - return 1; -#else - return APIC_INTEGRATED(lapic_get_version()); -#endif -} - -/* - * Check, whether this is a modern or a first generation APIC - */ -static int modern_apic(void) -{ - /* AMD systems use old APIC versions, so check the CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) - return 1; - return lapic_get_version() >= 0x14; -} - -/* - * Paravirt kernels also might be using these below ops. So we still - * use generic apic_read()/apic_write(), which might be pointing to different - * ops in PARAVIRT case. - */ -void xapic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -u32 safe_xapic_wait_icr_idle(void) -{ - u32 send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); - - return send_status; -} - -void xapic_icr_write(u32 low, u32 id) -{ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); - apic_write(APIC_ICR, low); -} - -u64 xapic_icr_read(void) -{ - u32 icr1, icr2; - - icr2 = apic_read(APIC_ICR2); - icr1 = apic_read(APIC_ICR); - - return icr1 | ((u64)icr2 << 32); -} - -static struct apic_ops xapic_ops = { - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .icr_read = xapic_icr_read, - .icr_write = xapic_icr_write, - .wait_icr_idle = xapic_wait_icr_idle, - .safe_wait_icr_idle = safe_xapic_wait_icr_idle, -}; - -struct apic_ops __read_mostly *apic_ops = &xapic_ops; -EXPORT_SYMBOL_GPL(apic_ops); - -#ifdef HAVE_X2APIC -static void x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return; -} - -static u32 safe_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return 0; -} - -void x2apic_icr_write(u32 low, u32 id) -{ - wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); -} - -u64 x2apic_icr_read(void) -{ - unsigned long val; - - rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); - return val; -} - -static struct apic_ops x2apic_ops = { - .read = native_apic_msr_read, - .write = native_apic_msr_write, - .icr_read = x2apic_icr_read, - .icr_write = x2apic_icr_write, - .wait_icr_idle = x2apic_wait_icr_idle, - .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, -}; -#endif - -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - - /* Level triggered for 82489DX (32bit mode) */ - if (!lapic_is_integrated()) - v |= APIC_LVT_LEVEL_TRIGGER; - - apic_write(APIC_LVT0, v); -} - -#ifdef CONFIG_X86_32 -/** - * get_physical_broadcast - Get number of physical broadcast IDs - */ -int get_physical_broadcast(void) -{ - return modern_apic() ? 0xff : 0xf; -} -#endif - -/** - * lapic_get_maxlvt - get the maximum number of local vector table entries - */ -int lapic_get_maxlvt(void) -{ - unsigned int v; - - v = apic_read(APIC_LVR); - /* - * - we always have APIC integrated on 64bit mode - * - 82489DXs do not report # of LVT entries - */ - return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; -} - -/* - * Local APIC timer - */ - -/* Clock divisor */ -#ifdef CONFG_X86_64 -#define APIC_DIVISOR 1 -#else -#define APIC_DIVISOR 16 -#endif - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) -{ - unsigned int lvtt_value, tmp_value; - - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - - if (!irqen) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, - (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | - APIC_TDR_DIV_16); - - if (!oneshot) - apic_write(APIC_TMICT, clocks / APIC_DIVISOR); -} - -/* - * Setup extended LVT, AMD specific (K8, family 10h) - * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. - * - * If mask=1, the LVT entry does not generate interrupts while mask=0 - * enables the vector. See also the BKDGs. - */ - -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) -{ - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - - apic_write(reg, v); -} - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} -EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); - -/* - * Program the next event, relative to now - */ -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - apic_write(APIC_TMICT, delta); - return 0; -} - -/* - * Setup the lapic timer in periodic or oneshot mode - */ -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - unsigned long flags; - unsigned int v; - - /* Lapic used as dummy for broadcast ? */ - if (evt->features & CLOCK_EVT_FEAT_DUMMY) - return; - - local_irq_save(flags); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(calibration_result, - mode != CLOCK_EVT_MODE_PERIODIC, 1); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - v = apic_read(APIC_LVTT); - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - break; - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } - - local_irq_restore(flags); -} - -/* - * Local APIC timer broadcast function - */ -static void lapic_timer_broadcast(cpumask_t mask) -{ -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#endif -} - -/* - * Setup the local APIC timer for this CPU. Copy the initilized values - * of the boot CPU and register the clock event in the framework. - */ -static void __cpuinit setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); - - clockevents_register_device(levt); -} - -#ifdef CONFIG_X86_64 -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (result * APIC_DIVISOR) / HZ; - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - return 0; -} - -#else -/* - * In this functions we calibrate APIC bus clocks to the external timer. - * - * We want to do the calibration only once since we want to have local timer - * irqs syncron. CPUs connected by the same APIC bus have the very same bus - * frequency. - * - * This was previously done by reading the PIT/HPET and waiting for a wrap - * around to find out, that a tick has elapsed. I have a box, where the PIT - * readout is broken, so it never gets out of the wait loop again. This was - * also reported by others. - * - * Monitoring the jiffies value is inaccurate and the clockevents - * infrastructure allows us to do a simple substitution of the interrupt - * handler. - * - * The calibration routine also uses the pm_timer when possible, as the PIT - * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes - * back to normal later in the boot process). - */ - -#define LAPIC_CAL_LOOPS (HZ/10) - -static __initdata int lapic_cal_loops = -1; -static __initdata long lapic_cal_t1, lapic_cal_t2; -static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; -static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; -static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; - -/* - * Temporary interrupt handler. - */ -static void __init lapic_cal_handler(struct clock_event_device *dev) -{ - unsigned long long tsc = 0; - long tapic = apic_read(APIC_TMCCT); - unsigned long pm = acpi_pm_read_early(); - - if (cpu_has_tsc) - rdtscll(tsc); - - switch (lapic_cal_loops++) { - case 0: - lapic_cal_t1 = tapic; - lapic_cal_tsc1 = tsc; - lapic_cal_pm1 = pm; - lapic_cal_j1 = jiffies; - break; - - case LAPIC_CAL_LOOPS: - lapic_cal_t2 = tapic; - lapic_cal_tsc2 = tsc; - if (pm < lapic_cal_pm1) - pm += ACPI_PM_OVRRUN; - lapic_cal_pm2 = pm; - lapic_cal_j2 = jiffies; - break; - } -} - -static int __init calibrate_APIC_clock(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - const long pm_100ms = PMTMR_TICKS_PER_SEC/10; - const long pm_thresh = pm_100ms/100; - void (*real_handler)(struct clock_event_device *dev); - unsigned long deltaj; - long delta, deltapm; - int pm_referenced = 0; - - local_irq_disable(); - - /* Replace the global interrupt handler */ - real_handler = global_clock_event->event_handler; - global_clock_event->event_handler = lapic_cal_handler; - - /* - * Setup the APIC counter to 1e9. There is no way the lapic - * can underflow in the 100ms detection time frame - */ - __setup_APIC_LVTT(1000000000, 0, 0); - - /* Let the interrupts run */ - local_irq_enable(); - - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); - - local_irq_disable(); - - /* Restore the real event handler */ - global_clock_event->event_handler = real_handler; - - /* Build delta t1-t2 as apic timer counts down */ - delta = lapic_cal_t1 - lapic_cal_t2; - apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); - - /* Check, if the PM timer is available */ - deltapm = lapic_cal_pm2 - lapic_cal_pm1; - apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); - - if (deltapm) { - unsigned long mult; - u64 res; - - mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); - - if (deltapm > (pm_100ms - pm_thresh) && - deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); - } else { - res = (((u64) deltapm) * mult) >> 22; - do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " - "with PM Timer: %ldms instead of 100ms\n", - (long)res); - /* Correct the lapic counter value */ - res = (((u64) delta) * pm_100ms); - do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " - "%lu (%ld)\n", (unsigned long) res, delta); - delta = (long) res; - } - pm_referenced = 1; - } - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; - - apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); - apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - calibration_result); - - if (cpu_has_tsc) { - delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), - (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); - } - - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%u.%04u MHz.\n", - calibration_result / (1000000 / HZ), - calibration_result % (1000000 / HZ)); - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - local_irq_enable(); - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - levt->features &= ~CLOCK_EVT_FEAT_DUMMY; - - /* We trust the pm timer based calibration */ - if (!pm_referenced) { - apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); - - /* - * Setup the apic timer manually - */ - levt->event_handler = lapic_cal_handler; - lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); - lapic_cal_loops = -1; - - /* Let the interrupts run */ - local_irq_enable(); - - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); - - local_irq_disable(); - - /* Stop the lapic timer */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); - - local_irq_enable(); - - /* Jiffies delta */ - deltaj = lapic_cal_j2 - lapic_cal_j1; - apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); - - /* Check, if the jiffies result is consistent */ - if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) - apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); - else - levt->features |= CLOCK_EVT_FEAT_DUMMY; - } else - local_irq_enable(); - - if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - printk(KERN_WARNING - "APIC timer disabled due to verification failure.\n"); - return -1; - } - - return 0; -} - -#endif - -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) -{ - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - - if (calibrate_APIC_clock()) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; - } - - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); - - /* Setup the lapic or request the broadcast */ - setup_APIC_timer(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(); -} - -/* - * The guts of the apic timer interrupt - */ -static void local_apic_timer_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * Normally we should not be here till LAPIC has been initialized but - * in some cases like kdump, its possible that there is a pending LAPIC - * timer interrupt from previous kernel's context and is delivered in - * new kernel the moment interrupts are enabled. - * - * Interrupts are enabled early and LAPIC is setup much later, hence - * its possible that when we get here evt->event_handler is NULL. - * Check for event_handler being NULL and discard the interrupt as - * spurious. - */ - if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); - /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); - return; - } - - /* - * the NMI deadlock-detector uses this. - */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif - - evt->event_handler(evt); -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - local_apic_timer_interrupt(); - irq_exit(); - - set_irq_regs(old_regs); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - -/* - * Local APIC start and shutdown - */ - -/** - * clear_local_APIC - shutdown the local APIC - * - * This is called, when a CPU is disabled and before rebooting, so the state of - * the local APIC has no dangling leftovers. Also used to cleanout any BIOS - * leftovers during boot. - */ -void clear_local_APIC(void) -{ - int maxlvt; - u32 v; - - /* APIC hasn't been mapped yet */ - if (!apic_phys) - return; - - maxlvt = lapic_get_maxlvt(); - /* - * Masking an LVT entry can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. - */ - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); - } - - /* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) - if (maxlvt >= 5) { - v = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); - } -#endif - /* - * Clean APIC state for other OSs: - */ - apic_write(APIC_LVTT, APIC_LVT_MASKED); - apic_write(APIC_LVT0, APIC_LVT_MASKED); - apic_write(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write(APIC_LVTPC, APIC_LVT_MASKED); - - /* Integrated APIC (!82489DX) ? */ - if (lapic_is_integrated()) { - if (maxlvt > 3) - /* Clear ESR due to Pentium errata 3AP and 11AP */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } -} - -/** - * disable_local_APIC - clear and disable the local APIC - */ -void disable_local_APIC(void) -{ - unsigned int value; - - clear_local_APIC(); - - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write(APIC_SPIV, value); - -#ifdef CONFIG_X86_32 - /* - * When LAPIC was disabled by the BIOS and enabled by the kernel, - * restore the disabled state. - */ - if (enabled_via_apicbase) { - unsigned int l, h; - - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_ENABLE; - wrmsr(MSR_IA32_APICBASE, l, h); - } -#endif -} - -/* - * If Linux enabled the LAPIC against the BIOS default disable it down before - * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and - * not power-off. Additionally clear all LVT entries before disable_local_APIC - * for the case where Linux didn't enable the LAPIC. - */ -void lapic_shutdown(void) -{ - unsigned long flags; - - if (!cpu_has_apic) - return; - - local_irq_save(flags); - -#ifdef CONFIG_X86_32 - if (!enabled_via_apicbase) - clear_local_APIC(); - else -#endif - disable_local_APIC(); - - - local_irq_restore(flags); -} - -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = lapic_get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ APIC_ID_MASK)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - -/** - * sync_Arb_IDs - synchronize APIC bus arbitration IDs - */ -void __init sync_Arb_IDs(void) -{ - /* - * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not - * needed on AMD. - */ - if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | - APIC_INT_LEVELTRIG | APIC_DM_INIT); -} - -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned int value; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - -#ifdef CONFIG_X86_32 - /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - (boot_cpu_data.x86 == 15)) - value &= ~APIC_SPIV_FOCUS_DISABLED; - else -#endif - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - if (!lapic_is_integrated()) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write(APIC_LVT1, value); -} - -static void __cpuinit lapic_setup_esr(void) -{ - unsigned long oldvalue, value, maxlvt; - if (lapic_is_integrated() && !esr_disable) { - if (esr_disable) { - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - return; - } - /* !82489DX */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); - - /* enables sending errors */ - value = ERROR_APIC_VECTOR; - apic_write(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - printk(KERN_INFO "No ESR for 82489DX.\n"); - } -} - - -/** - * setup_local_APIC - setup the local APIC - */ -void __cpuinit setup_local_APIC(void) -{ - unsigned int value; - int i, j; - -#ifdef CONFIG_X86_32 - /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - } -#endif - - preempt_disable(); - - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - if (!apic_id_registered()) - BUG(); - - /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ - init_apic_ldr(); - - /* - * Set Task Priority to 'accept all'. We never change this - * later on. - */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write(APIC_TASKPRI, value); - - /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. - */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1< 1) || - (boot_cpu_data.x86 == 15)) - break; - goto no_apic; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && cpu_has_apic)) - break; - goto no_apic; - default: - goto no_apic; - } - - if (!cpu_has_apic) { - /* - * Over-ride BIOS and try to enable the local APIC only if - * "lapic" specified. - */ - if (!force_enable_local_apic) { - printk(KERN_INFO "Local APIC disabled by BIOS -- " - "you can enable it with \"lapic\"\n"); - return -1; - } - /* - * Some BIOSes disable the local APIC in the APIC_BASE - * MSR. This can only be done in software for Intel P6 or later - * and AMD K7 (Model > 1) or later. - */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk(KERN_INFO - "Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - printk(KERN_WARNING "Could not enable APIC!\n"); - return -1; - } - set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - printk(KERN_INFO "Found and enabled local APIC!\n"); - - apic_pm_activate(); - - return 0; - -no_apic: - printk(KERN_INFO "No local APIC present or hardware disabled\n"); - return -1; -} -#endif - -#ifdef CONFIG_X86_64 -void __init early_init_lapic_mapping(void) -{ - unsigned long phys_addr; - - /* - * If no local APIC can be found then go out - * : it means there is no mpatable and MADT - */ - if (!smp_found_config) - return; - - phys_addr = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, phys_addr); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, phys_addr); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_physical_apicid = read_apic_id(); -} -#endif - -/** - * init_apic_mappings - initialize APIC mappings - */ -void __init init_apic_mappings(void) -{ -#ifdef HAVE_X2APIC - if (x2apic) { - boot_cpu_physical_apicid = read_apic_id(); - return; - } -#endif - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int apic_version[MAX_APICS]; - -int __init APIC_init_uniprocessor(void) -{ -#ifdef CONFIG_X86_64 - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; - } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; - } -#else - if (!smp_found_config && !cpu_has_apic) - return -1; - - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - return -1; - } -#endif - -#ifdef HAVE_X2APIC - enable_IR_x2apic(); -#endif -#ifdef CONFIG_X86_64 - setup_apic_routing(); -#endif - - verify_local_APIC(); - connect_bsp_APIC(); - -#ifdef CONFIG_X86_64 - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); -#else - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -# ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = read_apic_id(); -# endif -#endif - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - setup_local_APIC(); - -#ifdef CONFIG_X86_64 - /* - * Now enable IO-APICs, actually call clear_IO_APIC - * We need clear_IO_APIC before enabling vector on BP - */ - if (!skip_ioapic_setup && nr_ioapics) - enable_IO_APIC(); -#endif - -#ifdef CONFIG_X86_IO_APIC - if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) -#endif - localise_nmi_watchdog(); - end_local_APIC_setup(); - -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -# ifdef CONFIG_X86_64 - else - nr_ioapics = 0; -# endif -#endif - -#ifdef CONFIG_X86_64 - setup_boot_APIC_clock(); - check_nmi_watchdog(); -#else - setup_boot_clock(); -#endif - - return 0; -} - -/* - * Local APIC interrupts - */ - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_spurious_interrupt(void) -#else -void smp_spurious_interrupt(struct pt_regs *regs) -#endif -{ - u32 v; - -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - -#ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); -#else - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " - "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; -#endif - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_error_interrupt(void) -#else -void smp_error_interrupt(struct pt_regs *regs) -#endif -{ - u32 v, v1; - -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -/** - * connect_bsp_APIC - attach the APIC to the interrupt system - */ -void __init connect_bsp_APIC(void) -{ -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's - * local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -#endif - enable_apic_mode(); -} - -/** - * disconnect_bsp_APIC - detach the APIC from the interrupt system - * @virt_wire_setup: indicates, whether virtual wire mode is selected - * - * Virtual wire mode is necessary to deliver legacy interrupts even when the - * APIC is disabled. - */ -void disconnect_bsp_APIC(int virt_wire_setup) -{ - unsigned int value; - -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect only on - * certain older boards). Note that APIC interrupts, including - * IPIs, won't work beyond this point! The only exception are - * INIT IPIs. - */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - return; - } -#endif - - /* Go back to Virtual Wire compatibility mode */ - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* - * For LVT1 make it edge triggered, active high, - * nmi and enabled - */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - -void __cpuinit generic_processor_info(int apicid, int version) -{ - int cpu; - cpumask_t tmp_map; - - /* - * Validate version - */ - if (version == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; - } - apic_version[apicid] = version; - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - - physid_set(apicid, phys_cpu_present_map); - if (apicid == boot_cpu_physical_apicid) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - */ - cpu = 0; - } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; - -#ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj - */ - if (max_physical_apicid >= 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } -#endif - -#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) - /* are we being called early in kernel startup? */ - if (early_per_cpu_ptr(x86_cpu_to_apicid)) { - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - - cpu_to_apicid[cpu] = apicid; - bios_cpu_apicid[cpu] = apicid; - } else { - per_cpu(x86_cpu_to_apicid, cpu) = apicid; - per_cpu(x86_bios_cpu_apicid, cpu) = apicid; - } -#endif - - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); -} - -#ifdef CONFIG_X86_64 -int hard_smp_processor_id(void) -{ - return read_apic_id(); -} -#endif - -/* - * Power management - */ -#ifdef CONFIG_PM - -static struct { - /* - * 'active' is true if the local APIC was enabled by us and - * not the BIOS; this signifies that we are also responsible - * for disabling it before entering apm/acpi suspend - */ - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - local_irq_save(flags); - -#ifdef HAVE_X2APIC - if (x2apic) - enable_x2apic(); - else -#endif - { - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - } - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - - local_irq_restore(flags); - - return 0; -} - -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. - */ - -static struct sysdev_class lapic_sysclass = { - .name = "lapic", - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __cpuinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -#ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) -{ - int i, clusters, zeros; - unsigned id; - u16 *bios_cpu_apicid; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < NR_CPUS; i++) { - /* are we being called early in kernel startup? */ - if (bios_cpu_apicid) { - id = bios_cpu_apicid[i]; - } - else if (i < nr_cpu_ids) { - if (cpu_present(i)) - id = per_cpu(x86_bios_cpu_apicid, i); - else - continue; - } - else - break; - - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. - * Since clusters are allocated sequentially, count zeros only if - * they are bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) - return 1; - - /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. - */ - return (clusters > 2); -} -#endif - -/* - * APIC command line parameters - */ -static int __init setup_disableapic(char *arg) -{ - disable_apic = 1; - setup_clear_cpu_cap(X86_FEATURE_APIC); - return 0; -} -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static int __init setup_nolapic(char *arg) -{ - return setup_disableapic(arg); -} -early_param("nolapic", setup_nolapic); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static int __init parse_disable_apic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("noapictimer", parse_disable_apic_timer); - -static int __init parse_nolapic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("nolapic_timer", parse_nolapic_timer); - -static int __init apic_set_verbosity(char *arg) -{ - if (!arg) { -#ifdef CONFIG_X86_64 - skip_ioapic_setup = 0; - return 0; -#endif - return -EINVAL; - } - - if (strcmp("debug", arg) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", arg) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", arg); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -static int __init lapic_insert_resource(void) -{ - if (!apic_phys) - return -1; - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - return 0; -} - -/* - * need call insert after e820_reserve_resources() - * that is using request_resource - */ -late_initcall(lapic_insert_resource); -- cgit v1.2.3-70-g09d2 From e492c5ae85428d4a3815d06bf308c590120b928b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 22:41:26 -0700 Subject: x86: let 64 bit to use 32 bit calibrate_apic_clock Use the 32-bit APIC calibration code - it's more mature. Signed-of-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 88 ++------------------------------------------------ 1 file changed, 2 insertions(+), 86 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 6e99af5ce67..ca5ef71f420 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -478,90 +478,6 @@ static void __cpuinit setup_APIC_timer(void) clockevents_register_device(levt); } -#ifdef CONFIG_X86_64 -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (result * APIC_DIVISOR) / HZ; - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - return 0; -} - -#else /* * In this functions we calibrate APIC bus clocks to the external timer. * @@ -659,6 +575,7 @@ static int __init calibrate_APIC_clock(void) delta = lapic_cal_t1 - lapic_cal_t2; apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); +#ifdef CONFIG_X86_PM_TIMER /* Check, if the PM timer is available */ deltapm = lapic_cal_pm2 - lapic_cal_pm1; apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); @@ -687,6 +604,7 @@ static int __init calibrate_APIC_clock(void) } pm_referenced = 1; } +#endif /* Calculate the scaled math multiplication factor */ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, @@ -773,8 +691,6 @@ static int __init calibrate_APIC_clock(void) return 0; } -#endif - /* * Setup the boot APIC * -- cgit v1.2.3-70-g09d2 From 1dd6ba2e179773597e20f17f66049a64e6c4b2ec Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 25 Aug 2008 21:27:26 +0400 Subject: x86: apic - unify smp_spurious/error_interrupt declaration According to entry_64.S we do pass pt_regs pointer into interrupt handlers but don't use them. So we safely may merge the declarations. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 8 -------- include/asm-x86/hw_irq.h | 5 ----- 2 files changed, 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index ca5ef71f420..0556f375e40 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1659,11 +1659,7 @@ int __init APIC_init_uniprocessor(void) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_spurious_interrupt(void) -#else void smp_spurious_interrupt(struct pt_regs *regs) -#endif { u32 v; @@ -1694,11 +1690,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_error_interrupt(void) -#else void smp_error_interrupt(struct pt_regs *regs) -#endif { u32 v, v1; diff --git a/include/asm-x86/hw_irq.h b/include/asm-x86/hw_irq.h index 39c7a4745d2..749d042f055 100644 --- a/include/asm-x86/hw_irq.h +++ b/include/asm-x86/hw_irq.h @@ -96,13 +96,8 @@ extern asmlinkage void qic_call_function_interrupt(void); /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); -#ifdef CONFIG_X86_32 extern void smp_spurious_interrupt(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); -#else -extern asmlinkage void smp_spurious_interrupt(void); -extern asmlinkage void smp_error_interrupt(void); -#endif #ifdef CONFIG_X86_SMP extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); -- cgit v1.2.3-70-g09d2 From a11b5abef50722e42a7d13f6b799c4f606fcb797 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 3 Sep 2008 16:58:31 -0700 Subject: x2apic: fix reserved APIC register accesses in print_local_APIC() APIC_ARBPRI is a reserved register for XAPIC and beyond. APIC_RRR is a reserved register except for 82489DX, APIC for Pentium processors. APIC_EOI is a write only register. APIC_DFR is reserved in x2apic mode. Access to these registers in x2apic will result in #GP fault. Fix these apic register accesses. Signed-off-by: Yinghai Lu Signed-off-by: Suresh Siddha Cc: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 27 ++++++++++++++++++--------- include/asm-x86/apic.h | 14 ++++++++++++++ 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index d28128e0392..93ceb19d1e9 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1751,21 +1751,30 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); if (APIC_INTEGRATED(ver)) { /* !82489DX */ - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + } v = apic_read(APIC_PROCPRI); printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); } - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. + */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + } + v = apic_read(APIC_LDR); printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + } v = apic_read(APIC_SPIV); printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index 2d970f6bc2a..ef1d72dbdfe 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -98,6 +98,20 @@ extern void check_x2apic(void); extern void enable_x2apic(void); extern void enable_IR_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); +static inline int x2apic_enabled(void) +{ + int msr, msr2; + + if (!cpu_has_x2apic) + return 0; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (msr & X2APIC_ENABLE) + return 1; + return 0; +} +#else +#define x2apic_enabled() 0 #endif struct apic_ops { -- cgit v1.2.3-70-g09d2 From 02c1df199c990cd21c09a4dffaa06d4e0b7bf2bf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:57:11 +0200 Subject: x86: print out if acpi want physical flat of all Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_flat_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 9eca5ba7a6b..2ec2de8d8c4 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -179,8 +179,10 @@ static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) * is an example). */ if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; + } #endif return 0; -- cgit v1.2.3-70-g09d2 From 676f4a920be27160747439fe71026aa15ec78e5a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 4 Sep 2008 22:37:49 +0400 Subject: x86: io-apic - use ARRAY_SIZE macro Make the code width a bit shorter with ARRAY_SIZE macro. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 93ceb19d1e9..9b01fdadcb9 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -166,7 +166,7 @@ static void __init init_work(void *data) memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + legacy_count = ARRAY_SIZE(irq_cfg_legacy); for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); -- cgit v1.2.3-70-g09d2 From ac54a6c9371bacb86bee1db23f7d82e8685c7e17 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 4 Sep 2008 22:37:50 +0400 Subject: x86: io-apic - declare irq_cfg_lock for SPARSE_IRQ only We use irq_cfg_lock lock in SPARSE_IRQ only context so move it under #ifdef and compiler will be happy. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9b01fdadcb9..d22fecf828b 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -147,14 +147,15 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) static struct irq_cfg *irq_cfgx; +#ifdef CONFIG_HAVE_SPARSE_IRQ /* * Protect the irq_cfgx_free freelist: */ static DEFINE_SPINLOCK(irq_cfg_lock); -#ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_cfg *irq_cfgx_free; #endif + static void __init init_work(void *data) { struct dyn_array *da = data; -- cgit v1.2.3-70-g09d2 From b40d575bf0679c45aaf9e1161fc51a6b041b7210 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 5 Sep 2008 18:02:16 -0700 Subject: x86: HPET_MSI Refactor code in preparation for HPET_MSI Preparatory patch before the actual HPET MSI changes. Sets up hpet_set_mode and hpet_next_event for the MSI related changes. Just the code refactoring and should be zero functional change. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index acf62fc233d..f7cb5e9e261 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -227,44 +227,44 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt, int timer) { unsigned long cfg, cmp, now; uint64_t delta; switch(mode) { case CLOCK_EVT_MODE_PERIODIC: - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; - delta >>= hpet_clockevent.shift; + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; + delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned long) delta; - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); /* * The first write after writing TN_SETVAL to the * config register sets the counter value, the second * write sets the period. */ - hpet_writel(cmp, HPET_T0_CMP); + hpet_writel(cmp, HPET_Tn_CMP(timer)); udelay(1); - hpet_writel((unsigned long) delta, HPET_T0_CMP); + hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); break; case CLOCK_EVT_MODE_ONESHOT: - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); break; case CLOCK_EVT_MODE_RESUME: @@ -273,14 +273,14 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode, } } -static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt, int timer) { u32 cnt; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; - hpet_writel(cnt, HPET_T0_CMP); + hpet_writel(cnt, HPET_Tn_CMP(timer)); /* * We need to read back the CMP register to make sure that @@ -292,6 +292,18 @@ static int hpet_legacy_next_event(unsigned long delta, return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } +static void hpet_legacy_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + hpet_set_mode(mode, evt, 0); +} + +static int hpet_legacy_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + return hpet_next_event(delta, evt, 0); +} + /* * Clock source related code */ -- cgit v1.2.3-70-g09d2 From 58ac1e76ce77d515bd5cb65dbc465a040da341c6 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 5 Sep 2008 18:02:17 -0700 Subject: x86: HPET_MSI Basic HPET_MSI setup code Basic HPET MSI setup code. Routines to perform basic MSI read write in HPET memory map and setting up irq_chip for HPET MSI. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 54 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/io_apic.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/hpet.h | 21 ++++++++++++++++ 3 files changed, 139 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index f7cb5e9e261..3f10d16a834 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -25,6 +27,15 @@ unsigned long hpet_address; static void __iomem *hpet_virt_address; +struct hpet_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int flags; + char name[10]; +}; + unsigned long hpet_readl(unsigned long a) { return readl(hpet_virt_address + a); @@ -304,6 +315,49 @@ static int hpet_legacy_next_event(unsigned long delta, return hpet_next_event(delta, evt, 0); } +/* + * HPET MSI Support + */ + +void hpet_msi_unmask(unsigned int irq) +{ + struct hpet_dev *hdev = get_irq_data(irq); + unsigned long cfg; + + /* unmask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg |= HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_mask(unsigned int irq) +{ + unsigned long cfg; + struct hpet_dev *hdev = get_irq_data(irq); + + /* mask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg &= ~HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_write(unsigned int irq, struct msi_msg *msg) +{ + struct hpet_dev *hdev = get_irq_data(irq); + + hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); + hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); +} + +void hpet_msi_read(unsigned int irq, struct msi_msg *msg) +{ + struct hpet_dev *hdev = get_irq_data(irq); + + msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); + msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); + msg->address_hi = 0; +} + /* * Clock source related code */ diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index d22fecf828b..77fa155becf 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -41,6 +41,7 @@ #endif #include #include +#include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include #include #include @@ -3522,6 +3524,68 @@ int arch_setup_dmar_msi(unsigned int irq) } #endif +#ifdef CONFIG_HPET_TIMER + +#ifdef CONFIG_SMP +static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + hpet_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + hpet_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip hpet_msi_type = { + .name = "HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = hpet_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_hpet_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + + hpet_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + #endif /* CONFIG_PCI_MSI */ /* * Hypertransport interrupt support diff --git a/include/asm-x86/hpet.h b/include/asm-x86/hpet.h index cbbbb6d4dd3..58b273f6ef0 100644 --- a/include/asm-x86/hpet.h +++ b/include/asm-x86/hpet.h @@ -1,6 +1,8 @@ #ifndef ASM_X86__HPET_H #define ASM_X86__HPET_H +#include + #ifdef CONFIG_HPET_TIMER #define HPET_MMAP_SIZE 1024 @@ -10,6 +12,11 @@ #define HPET_CFG 0x010 #define HPET_STATUS 0x020 #define HPET_COUNTER 0x0f0 + +#define HPET_Tn_CFG(n) (0x100 + 0x20 * n) +#define HPET_Tn_CMP(n) (0x108 + 0x20 * n) +#define HPET_Tn_ROUTE(n) (0x110 + 0x20 * n) + #define HPET_T0_CFG 0x100 #define HPET_T0_CMP 0x108 #define HPET_T0_ROUTE 0x110 @@ -65,6 +72,20 @@ extern void hpet_disable(void); extern unsigned long hpet_readl(unsigned long a); extern void force_hpet_resume(void); +extern void hpet_msi_unmask(unsigned int irq); +extern void hpet_msi_mask(unsigned int irq); +extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg); +extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); + +#ifdef CONFIG_PCI_MSI +extern int arch_setup_hpet_msi(unsigned int irq); +#else +static inline int arch_setup_hpet_msi(unsigned int irq) +{ + return -EINVAL; +} +#endif + #ifdef CONFIG_HPET_EMULATE_RTC #include -- cgit v1.2.3-70-g09d2 From 4588c1f0354ac96a358b3f9e8e4331c51cf3336f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Sep 2008 14:19:17 +0200 Subject: x86: HPET_MSI Basic HPET_MSI setup code, cleanups small style cleanups. Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3f10d16a834..03d3655734b 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,39 +1,39 @@ #include #include +#include +#include #include #include #include #include -#include -#include -#include #include +#include +#include #include -#include #include -#include +#include -#define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 +#define HPET_MASK CLOCKSOURCE_MASK(32) +#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ -#define FSEC_PER_NSEC 1000000L +#define FSEC_PER_NSEC 1000000L /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ -unsigned long hpet_address; -static void __iomem *hpet_virt_address; +unsigned long hpet_address; +static void __iomem *hpet_virt_address; struct hpet_dev { - struct clock_event_device evt; - unsigned int num; - int cpu; - unsigned int irq; - unsigned int flags; - char name[10]; + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int flags; + char name[10]; }; unsigned long hpet_readl(unsigned long a) @@ -70,7 +70,7 @@ static inline void hpet_clear_mapping(void) static int boot_hpet_disable; int hpet_force_user; -static int __init hpet_setup(char* str) +static int __init hpet_setup(char *str) { if (str) { if (!strncmp("disable", str, 7)) @@ -91,7 +91,7 @@ __setup("nohpet", disable_hpet); static inline int is_hpet_capable(void) { - return (!boot_hpet_disable && hpet_address); + return !boot_hpet_disable && hpet_address; } /* @@ -122,10 +122,10 @@ static void hpet_reserve_platform_timers(unsigned long id) nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; - memset(&hd, 0, sizeof (hd)); - hd.hd_phys_address = hpet_address; - hd.hd_address = hpet; - hd.hd_nirqs = nrtimers; + memset(&hd, 0, sizeof(hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet; + hd.hd_nirqs = nrtimers; hpet_reserve_timer(&hd, 0); #ifdef CONFIG_HPET_EMULATE_RTC @@ -141,8 +141,8 @@ static void hpet_reserve_platform_timers(unsigned long id) hd.hd_irq[1] = HPET_LEGACY_RTC; for (i = 2; i < nrtimers; timer++, i++) { - hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; + hd.hd_irq[i] = (readl(&timer->hpet_config) & + Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; } hpet_alloc(&hd); @@ -244,7 +244,7 @@ static void hpet_set_mode(enum clock_event_mode mode, unsigned long cfg, cmp, now; uint64_t delta; - switch(mode) { + switch (mode) { case CLOCK_EVT_MODE_PERIODIC: delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; delta >>= evt->shift; -- cgit v1.2.3-70-g09d2 From 26afe5f2fbf06ea0765aaa316640c4dd472310c0 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 5 Sep 2008 18:02:18 -0700 Subject: x86: HPET_MSI Initialise per-cpu HPET timers Initialize a per CPU HPET MSI timer when possible. We retain the HPET timer 0 (IRQ 0) and timer 1 (IRQ 8) as is when legacy mode is being used. We setup the remaining HPET timers as per CPU MSI based timers. This per CPU timer will eliminate the need for timer broadcasting with IRQ 0 when there is non-functional LAPIC timer across CPU deep C-states. If there are more CPUs than number of available timers, CPUs that do not find any timer to use will continue using LAPIC and IRQ 0 broadcast. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 295 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 293 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 03d3655734b..31e9191b7e1 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -21,10 +21,19 @@ NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000L +#define HPET_DEV_USED_BIT 2 +#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT) +#define HPET_DEV_VALID 0x8 +#define HPET_DEV_FSB_CAP 0x1000 +#define HPET_DEV_PERI_CAP 0x2000 + +#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) + /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +unsigned long hpet_num_timers; static void __iomem *hpet_virt_address; struct hpet_dev { @@ -36,6 +45,10 @@ struct hpet_dev { char name[10]; }; +static struct hpet_dev *hpet_devs; + +static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); + unsigned long hpet_readl(unsigned long a) { return readl(hpet_virt_address + a); @@ -145,6 +158,16 @@ static void hpet_reserve_platform_timers(unsigned long id) Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; } + for (i = 0; i < nrtimers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + + hd.hd_irq[hdev->num] = hdev->irq; + hpet_reserve_timer(&hd, hdev->num); + } + hpet_alloc(&hd); } @@ -238,6 +261,8 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } +static int hpet_setup_msi_irq(unsigned int irq); + static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { @@ -279,7 +304,15 @@ static void hpet_set_mode(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_RESUME: - hpet_enable_legacy_int(); + if (timer == 0) { + hpet_enable_legacy_int(); + } else { + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_setup_msi_irq(hdev->irq); + disable_irq(hdev->irq); + irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); + enable_irq(hdev->irq); + } break; } } @@ -318,7 +351,7 @@ static int hpet_legacy_next_event(unsigned long delta, /* * HPET MSI Support */ - +#ifdef CONFIG_PCI_MSI void hpet_msi_unmask(unsigned int irq) { struct hpet_dev *hdev = get_irq_data(irq); @@ -358,6 +391,253 @@ void hpet_msi_read(unsigned int irq, struct msi_msg *msg) msg->address_hi = 0; } +static void hpet_msi_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_set_mode(mode, evt, hdev->num); +} + +static int hpet_msi_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + return hpet_next_event(delta, evt, hdev->num); +} + +static int hpet_setup_msi_irq(unsigned int irq) +{ + if (arch_setup_hpet_msi(irq)) { + destroy_irq(irq); + return -EINVAL; + } + return 0; +} + +static int hpet_assign_irq(struct hpet_dev *dev) +{ + unsigned int irq; + + irq = create_irq(); + if (!irq) + return -EINVAL; + + set_irq_data(irq, dev); + + if (hpet_setup_msi_irq(irq)) + return -EINVAL; + + dev->irq = irq; + return 0; +} + +static irqreturn_t hpet_interrupt_handler(int irq, void *data) +{ + struct hpet_dev *dev = (struct hpet_dev *)data; + struct clock_event_device *hevt = &dev->evt; + + if (!hevt->event_handler) { + printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n", + dev->num); + return IRQ_HANDLED; + } + + hevt->event_handler(hevt); + return IRQ_HANDLED; +} + +static int hpet_setup_irq(struct hpet_dev *dev) +{ + + if (request_irq(dev->irq, hpet_interrupt_handler, + IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev)) + return -1; + + disable_irq(dev->irq); + irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); + enable_irq(dev->irq); + + return 0; +} + +/* This should be called in specific @cpu */ +static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) +{ + struct clock_event_device *evt = &hdev->evt; + uint64_t hpet_freq; + + WARN_ON(cpu != smp_processor_id()); + if (!(hdev->flags & HPET_DEV_VALID)) + return; + + if (hpet_setup_msi_irq(hdev->irq)) + return; + + hdev->cpu = cpu; + per_cpu(cpu_hpet_dev, cpu) = hdev; + evt->name = hdev->name; + hpet_setup_irq(hdev); + evt->irq = hdev->irq; + + evt->rating = 110; + evt->features = CLOCK_EVT_FEAT_ONESHOT; + if (hdev->flags & HPET_DEV_PERI_CAP) + evt->features |= CLOCK_EVT_FEAT_PERIODIC; + + evt->set_mode = hpet_msi_set_mode; + evt->set_next_event = hpet_msi_next_event; + evt->shift = 32; + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. + */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + evt->mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, evt->shift); + /* Calculate the max delta */ + evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt); + /* 5 usec minimum reprogramming delta. */ + evt->min_delta_ns = 5000; + + evt->cpumask = cpumask_of_cpu(hdev->cpu); + clockevents_register_device(evt); +} + +#ifdef CONFIG_HPET +/* Reserve at least one timer for userspace (/dev/hpet) */ +#define RESERVE_TIMERS 1 +#else +#define RESERVE_TIMERS 0 +#endif +void hpet_msi_capability_lookup(unsigned int start_timer) +{ + unsigned int id; + unsigned int num_timers; + unsigned int num_timers_used = 0; + int i; + + id = hpet_readl(HPET_ID); + + num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + num_timers++; /* Value read out starts from 0 */ + + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); + if (!hpet_devs) + return; + + hpet_num_timers = num_timers; + + for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { + struct hpet_dev *hdev = &hpet_devs[num_timers_used]; + unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); + + /* Only consider HPET timer with MSI support */ + if (!(cfg & HPET_TN_FSB_CAP)) + continue; + + hdev->flags = 0; + if (cfg & HPET_TN_PERIODIC_CAP) + hdev->flags |= HPET_DEV_PERI_CAP; + hdev->num = i; + + sprintf(hdev->name, "hpet%d", i); + if (hpet_assign_irq(hdev)) + continue; + + hdev->flags |= HPET_DEV_FSB_CAP; + hdev->flags |= HPET_DEV_VALID; + num_timers_used++; + if (num_timers_used == num_possible_cpus()) + break; + } + + printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n", + num_timers, num_timers_used); +} + +static struct hpet_dev *hpet_get_unused_timer(void) +{ + int i; + + if (!hpet_devs) + return NULL; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + if (test_and_set_bit(HPET_DEV_USED_BIT, + (unsigned long *)&hdev->flags)) + continue; + return hdev; + } + return NULL; +} + +struct hpet_work_struct { + struct delayed_work work; + struct completion complete; +}; + +static void hpet_work(struct work_struct *w) +{ + struct hpet_dev *hdev; + int cpu = smp_processor_id(); + struct hpet_work_struct *hpet_work; + + hpet_work = container_of(w, struct hpet_work_struct, work.work); + + hdev = hpet_get_unused_timer(); + if (hdev) + init_one_hpet_msi_clockevent(hdev, cpu); + + complete(&hpet_work->complete); +} + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + unsigned long cpu = (unsigned long)hcpu; + struct hpet_work_struct work; + struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); + + switch (action & 0xf) { + case CPU_ONLINE: + INIT_DELAYED_WORK(&work.work, hpet_work); + init_completion(&work.complete); + /* FIXME: add schedule_work_on() */ + schedule_delayed_work_on(cpu, &work.work, 0); + wait_for_completion(&work.complete); + break; + case CPU_DEAD: + if (hdev) { + free_irq(hdev->irq, hdev); + hdev->flags &= ~HPET_DEV_USED; + per_cpu(cpu_hpet_dev, cpu) = NULL; + } + break; + } + return NOTIFY_OK; +} +#else + +void hpet_msi_capability_lookup(unsigned int start_timer) +{ + return; +} + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + return NOTIFY_OK; +} + +#endif + /* * Clock source related code */ @@ -493,8 +773,10 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); + hpet_msi_capability_lookup(2); return 1; } + hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -511,6 +793,8 @@ out_nohpet: */ static __init int hpet_late_init(void) { + int cpu; + if (boot_hpet_disable) return -ENODEV; @@ -526,6 +810,13 @@ static __init int hpet_late_init(void) hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + for_each_online_cpu(cpu) { + hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); + } + + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(hpet_cpuhp_notify, -20); + return 0; } fs_initcall(hpet_late_init); -- cgit v1.2.3-70-g09d2 From 3c2cbd2490656fb4b6ede586c557a2b09811a432 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 6 Sep 2008 14:15:33 +0400 Subject: x86: io-apic - code style cleaning for setup_IO_APIC_irqs By changing printout form we are able to shrink (and clean up) code a bit. Former printout example: init IO_APIC IRQs IO-APIC (apicid-pin) 1-1, 1-2, 1-3 not connected. IO-APIC (apicid-pin) 2-1, 2-2, 2-3 not connected. New printout example: init IO_APIC IRQs 1-1 1-2 1-3 (apicid-pin) not connected 2-1 2-2 2-3 (apicid-pin) not connected Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 53 +++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 77fa155becf..4040d575a21 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1518,41 +1518,44 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, static void __init setup_IO_APIC_irqs(void) { - int apic, pin, idx, irq, first_notcon = 1; + int apic, pin, idx, irq; + int notcon = 0; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); - continue; - } - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + idx = find_irq_entry(apic, pin, mp_INT); + if (idx == -1) { + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic].mp_apicid, pin); + if (!notcon) + notcon = 1; + continue; + } - irq = pin_2_irq(idx, apic, pin); + irq = pin_2_irq(idx, apic, pin); #ifdef CONFIG_X86_32 - if (multi_timer_check(apic, irq)) - continue; + if (multi_timer_check(apic, irq)) + continue; #endif - add_pin_to_irq(irq, apic, pin); + add_pin_to_irq(irq, apic, pin); - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); + } + if (notcon) { + apic_printk(APIC_VERBOSE, + KERN_DEBUG " (apicid-pin) not connected\n"); + notcon = 0; + } } - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); + if (notcon) + apic_printk(APIC_VERBOSE, + KERN_DEBUG " (apicid-pin) not connected\n"); } /* -- cgit v1.2.3-70-g09d2 From 79c09698cac8df16c5539447d5e1047fde9742e5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:57 -0700 Subject: x86: lapic address print out like io apic addr Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 0556f375e40..f3f50858048 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1548,7 +1548,7 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys); /* -- cgit v1.2.3-70-g09d2 From 2a554fb132cf804477087057b9b0ff2162984507 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 8 Sep 2008 19:38:06 +0400 Subject: x86: io-apic - do not use KERN_DEBUG marker too much Do not use KERN_DEBUG several times on the same line being printed. Introduced by mine previous patch, sorry. Reported-by: Yinghai Lu Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 4040d575a21..12e59df026d 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1528,11 +1528,16 @@ static void __init setup_IO_APIC_irqs(void) idx = find_irq_entry(apic, pin, mp_INT); if (idx == -1) { - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic].mp_apicid, pin); - if (!notcon) + if (!notcon) { notcon = 1; + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); continue; } @@ -1548,14 +1553,14 @@ static void __init setup_IO_APIC_irqs(void) } if (notcon) { apic_printk(APIC_VERBOSE, - KERN_DEBUG " (apicid-pin) not connected\n"); + " (apicid-pin) not connected\n"); notcon = 0; } } if (notcon) apic_printk(APIC_VERBOSE, - KERN_DEBUG " (apicid-pin) not connected\n"); + " (apicid-pin) not connected\n"); } /* -- cgit v1.2.3-70-g09d2 From f0ed4e695faf6766927c8cfbda2bc7530c7210c2 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Mon, 8 Sep 2008 10:18:40 -0700 Subject: x86: using HPET in MSI mode and setting up per CPU HPET timers, fix On Sat, Sep 06, 2008 at 06:03:53AM -0700, Ingo Molnar wrote: > > it crashes two testsystems, the fault on a NULL pointer in hpet init, > with: > > initcall print_all_ICs+0x0/0x520 returned 0 after 26 msecs > calling hpet_late_init+0x0/0x1c0 > BUG: unable to handle kernel NULL pointer dereference at 000000000000008c > IP: [] hpet_late_init+0xfe/0x1c0 > PGD 0 > Oops: 0000 [1] SMP > CPU 0 > Modules linked in: > Pid: 1, comm: swapper Not tainted 2.6.27-rc5 #29725 > RIP: 0010:[] [] hpet_late_init+0xfe/0x1c0 > RSP: 0018:ffff88003fa07dd0 EFLAGS: 00010246 > RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000 > RDX: ffffc20000000160 RSI: 0000000000000000 RDI: 0000000000000003 > RBP: ffff88003fa07e90 R08: 0000000000000000 R09: ffff88003fa07dd0 > R10: 0000000000000001 R11: 0000000000000000 R12: ffff88003fa07dd0 > R13: 0000000000000002 R14: ffffc20000000000 R15: 000000006f57e511 > FS: 0000000000000000(0000) GS:ffffffff80cf6a80(0000) knlGS:0000000000000000 > CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > CR2: 000000000000008c CR3: 0000000000201000 CR4: 00000000000006e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > Process swapper (pid: 1, threadinfo ffff88003fa06000, task ffff88003fa08000) > Stack: 00000000fed00000 ffffc20000000000 0000000100000003 0000000800000002 > 0000000000000000 0000000000000000 0000000000000000 0000000000000000 > 0000000000000000 0000000000000000 0000000000000000 0000000000000000 > Call Trace: > [] ? hpet_late_init+0x0/0x1c0 > [] do_one_initcall+0x45/0x190 > [] ? register_irq_proc+0x19/0xe0 > [] ? early_idt_handler+0x0/0x73 > [] kernel_init+0x14c/0x1b0 > [] ? trace_hardirqs_on_thunk+0x3a/0x3f > [] child_rip+0xa/0x11 > [] ? restore_args+0x0/0x30 > [] ? kernel_init+0x0/0x1b0 > [] ? child_rip+0x0/0x11 > Code: 20 48 83 c1 01 48 39 f1 75 e3 44 89 e8 4c 8b 05 29 29 22 00 31 f6 48 8d 78 01 66 66 90 89 f0 48 8d 04 80 48 c1 e0 05 4a 8d 0c 00 81 8c 00 00 00 08 74 26 8b 81 80 00 00 00 8b 91 88 00 00 00 > RIP [] hpet_late_init+0xfe/0x1c0 > RSP > CR2: 000000000000008c > Kernel panic - not syncing: Fatal exception There was one code path, with CONFIG_PCI_MSI disabled, where we were accessing hpet_devs without initialization. That resulted in the above crash. The change below adds a check for hpet_devs. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 31e9191b7e1..01005aeda7d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -126,6 +126,24 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled); * timer 0 and timer 1 in case of RTC emulation. */ #ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + int i; + + if (!hpet_devs) + return; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + + hd->hd_irq[hdev->num] = hdev->irq; + hpet_reserve_timer(hd, hdev->num); + } +} + static void hpet_reserve_platform_timers(unsigned long id) { struct hpet __iomem *hpet = hpet_virt_address; @@ -158,15 +176,7 @@ static void hpet_reserve_platform_timers(unsigned long id) Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; } - for (i = 0; i < nrtimers; i++) { - struct hpet_dev *hdev = &hpet_devs[i]; - - if (!(hdev->flags & HPET_DEV_VALID)) - continue; - - hd.hd_irq[hdev->num] = hdev->irq; - hpet_reserve_timer(&hd, hdev->num); - } + hpet_reserve_msi_timers(&hd); hpet_alloc(&hd); -- cgit v1.2.3-70-g09d2 From ba374c9baef910fbc5373541d98c50f15e82c3f8 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Mon, 8 Sep 2008 16:19:09 -0700 Subject: x86: fix HPET compiler error when not using CONFIG_PCI_MSI Added dummy function for hpet_setup_msi_irq(). Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 01005aeda7d..422c577ef69 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -635,6 +635,10 @@ static int hpet_cpuhp_notify(struct notifier_block *n, } #else +static int hpet_setup_msi_irq(unsigned int irq) +{ + return 0; +} void hpet_msi_capability_lookup(unsigned int start_timer) { return; -- cgit v1.2.3-70-g09d2 From 87783be4c26d79f5cde245e6e8a9fd2b295e92d7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 10 Sep 2008 22:19:50 +0400 Subject: x86: io-apic - get rid of __DO_ACTION macro Replace __DO_ACTION macro with io_apic_modify_irq function. This allow us to 'grep' definitions being hided by __DO_ACTION macro: __unmask_IO_APIC_irq __mask_IO_APIC_irq __mask_and_edge_IO_APIC_irq __unmask_and_level_IO_APIC_irq Signed-off-by: Cyrill Gorcunov Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 103 +++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 12e59df026d..ac47384724e 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -643,65 +643,66 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } -#define __DO_ACTION(R, ACTION_ENABLE, ACTION_DISABLE, FINAL) \ - \ -{ \ - int pin; \ - struct irq_cfg *cfg; \ - struct irq_pin_list *entry; \ - \ - cfg = irq_cfg(irq); \ - entry = cfg->irq_2_pin; \ - for (;;) { \ - unsigned int reg; \ - if (!entry) \ - break; \ - pin = entry->pin; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION_DISABLE; \ - reg ACTION_ENABLE; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = entry->next; \ - } \ -} - -#define DO_ACTION(name,R, ACTION_ENABLE, ACTION_DISABLE, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION_ENABLE, ACTION_DISABLE, FINAL) - -/* mask = 0 */ -DO_ACTION(__unmask, 0, |= 0, &= ~IO_APIC_REDIR_MASKED, ) +static inline void io_apic_modify_irq(unsigned int irq, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + int pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + cfg = irq_cfg(irq); + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + unsigned int reg; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); + } +} + +static void __unmask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); +} #ifdef CONFIG_X86_64 -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) +void io_apic_sync(struct irq_pin_list *entry) { - struct io_apic __iomem *io_apic = io_apic_base(apic); + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); readl(&io_apic->data); } -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, &= ~0, io_apic_sync(entry->apic)) - -#else - -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, &= ~0, ) - -/* mask = 1, trigger = 0 */ -DO_ACTION(__mask_and_edge, 0, |= IO_APIC_REDIR_MASKED, &= ~IO_APIC_REDIR_LEVEL_TRIGGER, ) +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); +} +#else /* CONFIG_X86_32 */ +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); +} -/* mask = 0, trigger = 1 */ -DO_ACTION(__unmask_and_level, 0, |= IO_APIC_REDIR_LEVEL_TRIGGER, &= ~IO_APIC_REDIR_MASKED, ) +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} -#endif +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); +} +#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq (unsigned int irq) { -- cgit v1.2.3-70-g09d2 From 823b259b80158a5fb694f6784e18b5bae669c599 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Sep 2008 21:56:46 -0700 Subject: x86: print out apic id in hex format Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/smpboot.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index f3f50858048..11cb1863958 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1586,7 +1586,7 @@ int __init APIC_init_uniprocessor(void) */ if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 32e73520adf..8f1e31db2ad 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -249,7 +249,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) } numa_set_node(cpu, node); - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 99468dbd08d..cce0b6118d5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -174,7 +174,7 @@ static void __cpuinit srat_detect_node(void) node = first_node(node_online_map); numa_set_node(cpu, node); - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c3aca7cb34..f84de2ff933 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -543,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid) int timeout; u32 status; - printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); + printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); + printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); /* * Wait for idle. @@ -874,7 +874,7 @@ do_rest: start_ip = setup_trampoline(); /* So we see what's up */ - printk(KERN_INFO "Booting processor %d/%d ip %lx\n", + printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", cpu, apicid, start_ip); /* -- cgit v1.2.3-70-g09d2 From 9df08f109572e88fbdab9a61d5cb0f0eae4ac9fa Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 14 Sep 2008 11:55:37 +0400 Subject: x86: apic - lapic_setup_esr does not handle esr_disable - fix it lapic_setup_esr doesn't handle esr_disable inquire. The error brought in during unification process. Fix it. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 63 ++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 11cb1863958..59550cc017d 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1081,40 +1081,43 @@ void __init init_bsp_APIC(void) static void __cpuinit lapic_setup_esr(void) { - unsigned long oldvalue, value, maxlvt; - if (lapic_is_integrated() && !esr_disable) { - if (esr_disable) { - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - return; - } - /* !82489DX */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); + unsigned int oldvalue, value, maxlvt; + + if (!lapic_is_integrated()) { + printk(KERN_INFO "No ESR for 82489DX.\n"); + return; + } - /* enables sending errors */ - value = ERROR_APIC_VECTOR; - apic_write(APIC_LVTERR, value); + if (esr_disable) { /* - * spec says clear errors after enabling vector. + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - printk(KERN_INFO "No ESR for 82489DX.\n"); + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; } + + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08x after: 0x%08x\n", + oldvalue, value); } -- cgit v1.2.3-70-g09d2 From 08ad776e3c54e6fe8b50939f176538063b69f89e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 14 Sep 2008 11:55:38 +0400 Subject: x86: apic - skip writting ESR register if we dont have on On 82489DX we don't have ESR register so we should not write it. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 59550cc017d..d730e8d3172 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1131,7 +1131,7 @@ void __cpuinit setup_local_APIC(void) #ifdef CONFIG_X86_32 /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { + if (lapic_is_integrated() && esr_disable) { apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); -- cgit v1.2.3-70-g09d2 From b189892de4da4634560657aedd774752094dbfa0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 12 Sep 2008 23:58:24 +0400 Subject: x86: apic - fix unused vars warning in calibrate_APIC_clock If we don't have CONFIG_X86_PM_TIMER=y compiler warns about unused variables. Move PM timer based calibration into a separate function and make the code cleaner and the compiler happy as well. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 81 +++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index d730e8d3172..784116933c7 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -538,14 +538,51 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } } +static int __init calibrate_by_pmtimer(long deltapm, long *delta) +{ + const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; + const long pm_thresh = pm_100ms / 100; + unsigned long mult; + u64 res; + +#ifndef CONFIG_X86_PM_TIMER + return -1; +#endif + + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + /* Check, if the PM timer is available */ + if (!deltapm) + return -1; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64)deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64)(*delta)) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long)res, *delta); + *delta = (long)res; + } + + return 0; +} + static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - const long pm_100ms = PMTMR_TICKS_PER_SEC/10; - const long pm_thresh = pm_100ms/100; void (*real_handler)(struct clock_event_device *dev); unsigned long deltaj; - long delta, deltapm; + long delta; int pm_referenced = 0; local_irq_disable(); @@ -575,36 +612,9 @@ static int __init calibrate_APIC_clock(void) delta = lapic_cal_t1 - lapic_cal_t2; apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); -#ifdef CONFIG_X86_PM_TIMER - /* Check, if the PM timer is available */ - deltapm = lapic_cal_pm2 - lapic_cal_pm1; - apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); - - if (deltapm) { - unsigned long mult; - u64 res; - - mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); - - if (deltapm > (pm_100ms - pm_thresh) && - deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); - } else { - res = (((u64) deltapm) * mult) >> 22; - do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " - "with PM Timer: %ldms instead of 100ms\n", - (long)res); - /* Correct the lapic counter value */ - res = (((u64) delta) * pm_100ms); - do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " - "%lu (%ld)\n", (unsigned long) res, delta); - delta = (long) res; - } - pm_referenced = 1; - } -#endif + /* we trust the PM based calibration if possible */ + pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, + &delta); /* Calculate the scaled math multiplication factor */ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, @@ -646,7 +656,10 @@ static int __init calibrate_APIC_clock(void) levt->features &= ~CLOCK_EVT_FEAT_DUMMY; - /* We trust the pm timer based calibration */ + /* + * PM timer calibration failed or not turned on + * so lets try APIC timer based calibration + */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); -- cgit v1.2.3-70-g09d2 From 56ffa1a028b9fce3860a247c6fe79fce7cbf425b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 13 Sep 2008 13:11:16 +0400 Subject: x86: io-apic - do not use KERN_DEBUG marker too much, fix Yinghai Lu reported: | > 0 add_pin_to_irq: irq 15 --> apic 0 pin 15 | > IOAPIC[0]: Set routing entry (8-15 -> 0x3f -> IRQ 15 Mode:0 Active:0) | > 8-16 8-17 8-18 8-19 8-20 8-21 8-22 8-23 (apicid-pin) not connected | > 9-0 9-1 9-2 9-3 9-4 9-5 9-6 9-7 9-8 9-9 9-10 9-11 9-12 9-13 9-14 9-15 | > 9-16 9-17 9-18 9-19 9-20 9-21 9-22 9-23 (apicid-pin) not connected | > | | only first one not connected at first, and ... here is a quick fix for this. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index ac47384724e..6ce5873a406 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1541,6 +1541,11 @@ static void __init setup_IO_APIC_irqs(void) pin); continue; } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } irq = pin_2_irq(idx, apic, pin); #ifdef CONFIG_X86_32 @@ -1552,11 +1557,6 @@ static void __init setup_IO_APIC_irqs(void) setup_IO_APIC_irq(apic, pin, irq, irq_trigger(idx), irq_polarity(idx)); } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } } if (notcon) -- cgit v1.2.3-70-g09d2 From 9d98598d2fc286c8dbcd0b681168639528428db0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 19 Sep 2008 00:12:26 -0700 Subject: sparseirq: remove some debug print out Signed-off-by: Yinghai Lu Cc: Yinghai Lu Cc: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 19 ------------------- kernel/irq/handle.c | 18 ------------------ 2 files changed, 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 6ce5873a406..01419fdc1d5 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -277,23 +277,6 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) spin_unlock_irqrestore(&irq_cfg_lock, flags); - printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); -#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG - { - /* dump the results */ - struct irq_cfg *cfg; - unsigned long phys; - unsigned long bytes = sizeof(struct irq_cfg); - - printk(KERN_DEBUG "=========================== %d\n", irq); - printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); - for_each_irq_cfg(cfg) { - phys = __pa(cfg); - printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); - } - printk(KERN_DEBUG "===========================\n"); - } -#endif return cfg; } #else @@ -597,7 +580,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) cfg->irq_2_pin = entry; entry->apic = apic; entry->pin = pin; - printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); return; } @@ -613,7 +595,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry = entry->next; entry->apic = apic; entry->pin = pin; - printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); } /* diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index eae69373a9c..710db0ec97e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -238,24 +238,6 @@ struct irq_desc *irq_to_desc_alloc(unsigned int irq) spin_unlock_irqrestore(&sparse_irq_lock, flags); - printk(KERN_DEBUG "found new irq_desc for irq %d\n", desc->irq); -#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG - { - /* dump the results */ - struct irq_desc *desc; - unsigned long phys; - unsigned long bytes = sizeof(struct irq_desc); - unsigned int irqx; - - printk(KERN_DEBUG "=========================== %d\n", irq); - printk(KERN_DEBUG "irq_desc dump after get that for %d\n", irq); - for_each_irq_desc(irqx, desc) { - phys = __pa(desc); - printk(KERN_DEBUG "irq_desc %d ==> [%#lx - %#lx]\n", irqx, phys, phys + bytes); - } - printk(KERN_DEBUG "===========================\n"); - } -#endif return desc; } #else -- cgit v1.2.3-70-g09d2 From 5ffa4eb2224343ec3fbd492ffe71e28e797435b7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 18 Sep 2008 23:37:57 +0400 Subject: x86: io-apic - interrupt remapping fix Interrupt remapping could lead to NULL dereference in case of kzalloc failed and memory leak in other way. So fix the both cases. Signed-off-by: Cyrill Gorcunov Cc: "Maciej W. Rozycki" Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 13 ++++++++++--- arch/x86/kernel/io_apic.c | 19 +++++++++++++++++-- 2 files changed, 27 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 784116933c7..1f48bd1c9f2 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1360,7 +1360,12 @@ void enable_IR_x2apic(void) local_irq_save(flags); mask_8259A(); - save_mask_IO_APIC_setup(); + + ret = save_mask_IO_APIC_setup(); + if (ret) { + printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); + goto end; + } ret = enable_intr_remapping(1); @@ -1370,14 +1375,15 @@ void enable_IR_x2apic(void) } if (ret) - goto end; + goto end_restore; if (!x2apic) { x2apic = 1; apic_ops = &x2apic_ops; enable_x2apic(); } -end: + +end_restore: if (ret) /* * IR enabling failed @@ -1386,6 +1392,7 @@ end: else reinit_intr_remapped_IO_APIC(x2apic_preenabled); +end: unmask_8259A(); local_irq_restore(flags); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 01419fdc1d5..4cc9cb64c81 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -812,7 +812,7 @@ int save_mask_IO_APIC_setup(void) kzalloc(sizeof(struct IO_APIC_route_entry) * nr_ioapic_registers[apic], GFP_KERNEL); if (!early_ioapic_entries[apic]) - return -ENOMEM; + goto nomem; } for (apic = 0; apic < nr_ioapics; apic++) @@ -826,17 +826,32 @@ int save_mask_IO_APIC_setup(void) ioapic_write_entry(apic, pin, entry); } } + return 0; + +nomem: + for (; apic > 0; apic--) + kfree(early_ioapic_entries[apic]); + kfree(early_ioapic_entries[apic]); + memset(early_ioapic_entries, 0, + ARRAY_SIZE(early_ioapic_entries)); + + return -ENOMEM; } void restore_IO_APIC_setup(void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) + for (apic = 0; apic < nr_ioapics; apic++) { + if (!early_ioapic_entries[apic]) + break; for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ioapic_write_entry(apic, pin, early_ioapic_entries[apic][pin]); + kfree(early_ioapic_entries[apic]); + early_ioapic_entries[apic] = NULL; + } } void reinit_intr_remapped_IO_APIC(int intr_remapping) -- cgit v1.2.3-70-g09d2 From 8da077d6f31da291ee3a7dd559671cb8ca48cbe2 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 23 Sep 2008 08:42:05 -0500 Subject: x86, uv: fix ordering of calls to uv_system_init & uv_cpu_init Fix problem caused by reordering of the calls to uv_cpu_init() & uv_system_init. Originally, uv_cpu_init() was called AFTER uv_system_init. This order was recently broken as a side-effect of other patches. With this patch, initialization of cpu 0 is now done by the system_init call. Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 33581d94a90..b689075bbe2 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -356,7 +356,22 @@ static __init void uv_rtc_init(void) sn_rtc_cycles_per_second = ticks_per_sec; } -static bool uv_system_inited; +/* + * Called on each cpu to initialize the per_cpu UV data area. + * ZZZ hotplug not supported yet + */ +void __cpuinit uv_cpu_init(void) +{ + /* CPU 0 initilization will be done via uv_system_init. */ + if (!uv_blade_info) + return; + + uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; + + if (get_uv_system_type() == UV_NON_UNIQUE_APIC) + set_x2apic_extra_bits(uv_hub_info->pnode); +} + void __init uv_system_init(void) { @@ -448,21 +463,6 @@ void __init uv_system_init(void) map_mmr_high(max_pnode); map_config_high(max_pnode); map_mmioh_high(max_pnode); - uv_system_inited = true; -} -/* - * Called on each cpu to initialize the per_cpu UV data area. - * ZZZ hotplug not supported yet - */ -void __cpuinit uv_cpu_init(void) -{ - BUG_ON(!uv_system_inited); - - uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; - - if (get_uv_system_type() == UV_NON_UNIQUE_APIC) - set_x2apic_extra_bits(uv_hub_info->pnode); + uv_cpu_init(); } - - -- cgit v1.2.3-70-g09d2 From 7564676813780e0ba4dacf95338202cb72d2172d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Sep 2008 19:04:36 -0700 Subject: x86: irq no should not use hex in /proc/interrupts Arjan van de Ven noticed that we changed IRQ numbers from decimal to hex in /proc/interrupts - that can break user-space utilities like irqbalanced. Reported-by: Arjan van de Ven Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index b2e1082cf5a..001772ffc91 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -307,7 +307,7 @@ int show_interrupts(struct seq_file *p, void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%#x: ",i); + seq_printf(p, "%3d: ", i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index c2fca89a3f7..ec266109128 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -112,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%#x: ",i); + seq_printf(p, "%3d: ", i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else -- cgit v1.2.3-70-g09d2 From c1370b49cc4f09de5b447ddf688a3988a297dbfc Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 23 Sep 2008 23:00:02 +0400 Subject: x86: io-apic - interrupt remapping fix Clean up obscure for() cycle with straight while() form Signed-off-by: Cyrill Gorcunov Cc: "Maciej W. Rozycki" Acked-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 4cc9cb64c81..ed68a6f99dc 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -830,9 +830,8 @@ int save_mask_IO_APIC_setup(void) return 0; nomem: - for (; apic > 0; apic--) - kfree(early_ioapic_entries[apic]); - kfree(early_ioapic_entries[apic]); + while (apic >= 0) + kfree(early_ioapic_entries[apic--]); memset(early_ioapic_entries, 0, ARRAY_SIZE(early_ioapic_entries)); -- cgit v1.2.3-70-g09d2 From c81bba49a13cb3376654d0cc93dc069fd619ed76 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 25 Sep 2008 11:53:11 -0700 Subject: x86: print out irq nr for msi/ht, v3 v2: fix hpet compiling error v3: Bjorn want to use dev_printk instead Signed-off-by: Yinghai Lu Cc: Bjorn Helgaas Cc: Jesse Barnes Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 3 +++ arch/x86/kernel/io_apic.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 422c577ef69..2913913f4a4 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -467,6 +467,9 @@ static int hpet_setup_irq(struct hpet_dev *dev) irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); enable_irq(dev->irq); + printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", + dev->name, dev->irq); + return 0; } diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index ed68a6f99dc..4ee270d3035 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3354,6 +3354,8 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) #endif set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); + return 0; } @@ -3586,6 +3588,7 @@ int arch_setup_hpet_msi(unsigned int irq) hpet_msi_write(irq, &msg); set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, "edge"); + return 0; } #endif @@ -3682,6 +3685,8 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) set_irq_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); } return err; } -- cgit v1.2.3-70-g09d2 From 5f79f2f2ad39b5177c52ed08ffd066ea0c1da924 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Wed, 24 Sep 2008 10:03:17 -0700 Subject: hpet: clean up warning Fix the below compile warnings due to recent HPET MSI changes arch/x86/kernel/hpet.c:48: warning: 'hpet_devs' defined but not used arch/x86/kernel/hpet.c:50: warning: 'per_cpu__cpu_hpet_dev' defined but not used Reported-by: Andrew Morton Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 57 +++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 2913913f4a4..77017e834cf 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -45,10 +45,6 @@ struct hpet_dev { char name[10]; }; -static struct hpet_dev *hpet_devs; - -static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); - unsigned long hpet_readl(unsigned long a) { return readl(hpet_virt_address + a); @@ -126,23 +122,8 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled); * timer 0 and timer 1 in case of RTC emulation. */ #ifdef CONFIG_HPET -static void hpet_reserve_msi_timers(struct hpet_data *hd) -{ - int i; - if (!hpet_devs) - return; - - for (i = 0; i < hpet_num_timers; i++) { - struct hpet_dev *hdev = &hpet_devs[i]; - - if (!(hdev->flags & HPET_DEV_VALID)) - continue; - - hd->hd_irq[hdev->num] = hdev->irq; - hpet_reserve_timer(hd, hdev->num); - } -} +static void hpet_reserve_msi_timers(struct hpet_data *hd); static void hpet_reserve_platform_timers(unsigned long id) { @@ -362,6 +343,10 @@ static int hpet_legacy_next_event(unsigned long delta, * HPET MSI Support */ #ifdef CONFIG_PCI_MSI + +static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); +static struct hpet_dev *hpet_devs; + void hpet_msi_unmask(unsigned int irq) { struct hpet_dev *hdev = get_irq_data(irq); @@ -525,7 +510,8 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) #else #define RESERVE_TIMERS 0 #endif -void hpet_msi_capability_lookup(unsigned int start_timer) + +static void hpet_msi_capability_lookup(unsigned int start_timer) { unsigned int id; unsigned int num_timers; @@ -571,6 +557,26 @@ void hpet_msi_capability_lookup(unsigned int start_timer) num_timers, num_timers_used); } +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + int i; + + if (!hpet_devs) + return; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + + hd->hd_irq[hdev->num] = hdev->irq; + hpet_reserve_timer(hd, hdev->num); + } +} +#endif + static struct hpet_dev *hpet_get_unused_timer(void) { int i; @@ -642,10 +648,17 @@ static int hpet_setup_msi_irq(unsigned int irq) { return 0; } -void hpet_msi_capability_lookup(unsigned int start_timer) +static void hpet_msi_capability_lookup(unsigned int start_timer) +{ + return; +} + +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) { return; } +#endif static int hpet_cpuhp_notify(struct notifier_block *n, unsigned long action, void *hcpu) -- cgit v1.2.3-70-g09d2 From 4173a0e7371ece227559b44943c6fd456ee470d1 Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Thu, 2 Oct 2008 12:18:21 -0500 Subject: x86, UV: add uv_setup_irq() and uv_teardown_irq() functions, v3 Provide a means for UV interrupt MMRs to be setup with the message to be sent when an MSI is raised. Signed-off-by: Dean Nelson Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/io_apic.c | 68 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/uv_irq.c | 77 +++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/uv/uv_irq.h | 36 +++++++++++++++++++++ 4 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/uv_irq.c create mode 100644 include/asm-x86/uv/uv_irq.h (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 45cecc59d89..acc0e8bf043 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -108,7 +108,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o - obj-y += bios_uv.o + obj-y += bios_uv.o uv_irq.o obj-y += genx2apic_cluster.o obj-y += genx2apic_phys.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 4ee270d3035..260c95a5e6d 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -58,6 +58,8 @@ #include #include #include +#include +#include #include #include @@ -3692,6 +3694,72 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ +#ifdef CONFIG_X86_64 +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset) +{ + const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long flags; + int err; + + err = assign_irq_vector(irq, *eligible_cpu); + if (err != 0) + return err; + + spin_lock_irqsave(&vector_lock, flags); + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + spin_unlock_irqrestore(&vector_lock, flags); + + cfg = irq_cfg(irq); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->vector = cfg->vector; + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = cpu_mask_to_apicid(*eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. + */ +void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int mmr_pnode; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->mask = 1; + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} +#endif /* CONFIG_X86_64 */ + int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c new file mode 100644 index 00000000000..6bd26c91c30 --- /dev/null +++ b/arch/x86/kernel/uv_irq.c @@ -0,0 +1,77 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV IRQ functions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#include +#include +#include + +static void uv_noop(unsigned int irq) +{ +} + +static unsigned int uv_noop_ret(unsigned int irq) +{ + return 0; +} + +static void uv_ack_apic(unsigned int irq) +{ + ack_APIC_irq(); +} + +struct irq_chip uv_irq_chip = { + .name = "UV-CORE", + .startup = uv_noop_ret, + .shutdown = uv_noop, + .enable = uv_noop, + .disable = uv_noop, + .ack = uv_noop, + .mask = uv_noop, + .unmask = uv_noop, + .eoi = uv_ack_apic, + .end = uv_noop, +}; + +/* + * Set up a mapping of an available irq and vector, and enable the specified + * MMR that defines the MSI that is to be sent to the specified CPU when an + * interrupt is raised. + */ +int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, + unsigned long mmr_offset) +{ + int irq; + int ret; + + irq = create_irq(); + if (irq <= 0) + return -EBUSY; + + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); + if (ret != irq) + destroy_irq(irq); + + return ret; +} +EXPORT_SYMBOL_GPL(uv_setup_irq); + +/* + * Tear down a mapping of an irq and vector, and disable the specified MMR that + * defined the MSI that was to be sent to the specified CPU when an interrupt + * was raised. + * + * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). + */ +void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +{ + arch_disable_uv_irq(mmr_blade, mmr_offset); + destroy_irq(irq); +} +EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/include/asm-x86/uv/uv_irq.h b/include/asm-x86/uv/uv_irq.h new file mode 100644 index 00000000000..8bf5f32da9c --- /dev/null +++ b/include/asm-x86/uv/uv_irq.h @@ -0,0 +1,36 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV IRQ definitions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#ifndef ASM_X86__UV__UV_IRQ_H +#define ASM_X86__UV__UV_IRQ_H + +/* If a generic version of this structure gets defined, eliminate this one. */ +struct uv_IO_APIC_route_entry { + __u64 vector : 8, + delivery_mode : 3, + dest_mode : 1, + delivery_status : 1, + polarity : 1, + __reserved_1 : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15, + dest : 32; +}; + +extern struct irq_chip uv_irq_chip; + +extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long); +extern void arch_disable_uv_irq(int, unsigned long); + +extern int uv_setup_irq(char *, int, int, unsigned long); +extern void uv_teardown_irq(unsigned int, int, unsigned long); + +#endif /* ASM_X86__UV__UV_IRQ_H */ -- cgit v1.2.3-70-g09d2 From 37762b6ffb37f9e21cbc6f80902aa06b7a053fd7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Oct 2008 11:38:37 +0200 Subject: x86, UV: add uv_setup_irq() and uv_teardown_irq() functions, v3, fix fix: arch/x86/kernel/uv_irq.c: In function 'uv_ack_apic': arch/x86/kernel/uv_irq.c:26: error: implicit declaration of function 'ack_APIC_irq' Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_irq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 6bd26c91c30..aeef529917e 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -10,6 +10,8 @@ #include #include + +#include #include static void uv_noop(unsigned int irq) -- cgit v1.2.3-70-g09d2 From a50f70b17541c0060967c6df61133e968bad3652 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 3 Oct 2008 11:58:54 -0500 Subject: x86: Add UV EFI table entry v4 Look for a UV entry in the EFI tables. Signed-off-by: Russ Anderson Signed-off-by: Paul Jackson Acked-by: Huang Ying Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/efi.c | 4 ++++ include/linux/efi.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 945a31cdd81..1119d247fe1 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -366,6 +366,10 @@ void __init efi_init(void) SMBIOS_TABLE_GUID)) { efi.smbios = config_tables[i].table; printk(" SMBIOS=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + UV_SYSTEM_TABLE_GUID)) { + efi.uv_systab = config_tables[i].table; + printk(" UVsystab=0x%lx ", config_tables[i].table); } else if (!efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID)) { efi.hcdp = config_tables[i].table; diff --git a/include/linux/efi.h b/include/linux/efi.h index 807373d467f..bb66feb164b 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -208,6 +208,9 @@ typedef efi_status_t efi_set_virtual_address_map_t (unsigned long memory_map_siz #define EFI_GLOBAL_VARIABLE_GUID \ EFI_GUID( 0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c ) +#define UV_SYSTEM_TABLE_GUID \ + EFI_GUID( 0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93 ) + typedef struct { efi_guid_t guid; unsigned long table; @@ -255,6 +258,7 @@ extern struct efi { unsigned long boot_info; /* boot info table */ unsigned long hcdp; /* HCDP table */ unsigned long uga; /* UGA table */ + unsigned long uv_systab; /* UV system table */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; -- cgit v1.2.3-70-g09d2 From 7f5942329e0787087a5e4dced838cee711ac2b58 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 3 Oct 2008 11:59:15 -0500 Subject: x86: Add UV bios call infrastructure v4 Add the EFI callback function and associated wrapper code. Initialize SAL system table entry info at boot time. Signed-off-by: Russ Anderson Signed-off-by: Paul Jackson Acked-by: Huang Ying Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/bios_uv.c | 101 +++++++++++++++++++++++++++++++-------- arch/x86/kernel/genx2apic_uv_x.c | 1 + include/asm-x86/efi.h | 13 +++++ include/asm-x86/uv/bios.h | 73 +++++++++++++++------------- 4 files changed, 135 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index fdd585f9c53..5481eb59f78 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -1,8 +1,6 @@ /* * BIOS run time interface routines. * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -16,33 +14,94 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson */ +#include +#include +#include #include -const char * -x86_bios_strerror(long status) +struct uv_systab uv_systab; + +s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { - const char *str; - switch (status) { - case 0: str = "Call completed without error"; break; - case -1: str = "Not implemented"; break; - case -2: str = "Invalid argument"; break; - case -3: str = "Call completed with error"; break; - default: str = "Unknown BIOS status code"; break; - } - return str; + struct uv_systab *tab = &uv_systab; + + if (!tab->function) + /* + * BIOS does not support UV systab + */ + return BIOS_STATUS_UNIMPLEMENTED; + + return efi_call6((void *)__va(tab->function), + (u64)which, a1, a2, a3, a4, a5); } -long -x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, - unsigned long *drift_info) +s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) { - struct uv_bios_retval isrv; + unsigned long bios_flags; + s64 ret; + + local_irq_save(bios_flags); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + local_irq_restore(bios_flags); + + return ret; +} + +s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) +{ + s64 ret; + + preempt_disable(); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + preempt_enable(); - BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); - *ticks_per_second = isrv.v0; - *drift_info = isrv.v1; - return isrv.status; + return ret; +} + +long +x86_bios_freq_base(unsigned long clock_type, unsigned long *ticks_per_second, + unsigned long *drift_info) +{ + return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, + (u64)ticks_per_second, 0, 0, 0); } EXPORT_SYMBOL_GPL(x86_bios_freq_base); + + +#ifdef CONFIG_EFI +void uv_bios_init(void) +{ + struct uv_systab *tab; + + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || + (efi.uv_systab == (unsigned long)NULL)) { + printk(KERN_CRIT "No EFI UV System Table.\n"); + uv_systab.function = (unsigned long)NULL; + return; + } + + tab = (struct uv_systab *)ioremap(efi.uv_systab, + sizeof(struct uv_systab)); + if (strncmp(tab->signature, "UVST", 4) != 0) + printk(KERN_ERR "bad signature in UV system table!"); + + /* + * Copy table to permanent spot for later use. + */ + memcpy(&uv_systab, tab, sizeof(struct uv_systab)); + iounmap(tab); + + printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision); +} +#else /* !CONFIG_EFI */ + +void uv_bios_init(void) { } +#endif + diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index b689075bbe2..aa2e5e15bf6 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -427,6 +427,7 @@ void __init uv_system_init(void) gnode_upper = (((unsigned long)node_id.s.node_id) & ~((1 << n_val) - 1)) << m_val; + uv_bios_init(); uv_rtc_init(); for_each_present_cpu(cpu) { diff --git a/include/asm-x86/efi.h b/include/asm-x86/efi.h index ed2de22e870..313438e6334 100644 --- a/include/asm-x86/efi.h +++ b/include/asm-x86/efi.h @@ -94,4 +94,17 @@ extern void efi_reserve_early(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); +#ifndef CONFIG_EFI +/* + * IF EFI is not configured, have the EFI calls return -ENOSYS. + */ +#define efi_call0(_f) (-ENOSYS) +#define efi_call1(_f, _a1) (-ENOSYS) +#define efi_call2(_f, _a1, _a2) (-ENOSYS) +#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS) +#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) +#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) +#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) +#endif /* CONFIG_EFI */ + #endif /* ASM_X86__EFI_H */ diff --git a/include/asm-x86/uv/bios.h b/include/asm-x86/uv/bios.h index 7cd6d7ec130..f63e46e5337 100644 --- a/include/asm-x86/uv/bios.h +++ b/include/asm-x86/uv/bios.h @@ -2,9 +2,7 @@ #define ASM_X86__UV__BIOS_H /* - * BIOS layer definitions. - * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * UV BIOS layer definitions. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,50 +17,61 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson */ #include -#define BIOS_FREQ_BASE 0x01000001 +/* + * Values for the BIOS calls. It is passed as the first * argument in the + * BIOS call. Passing any other value in the first argument will result + * in a BIOS_STATUS_UNIMPLEMENTED return status. + */ +enum uv_bios_cmd { + UV_BIOS_COMMON, + UV_BIOS_GET_SN_INFO, + UV_BIOS_FREQ_BASE +}; +/* + * Status values returned from a BIOS call. + */ enum { - BIOS_FREQ_BASE_PLATFORM = 0, - BIOS_FREQ_BASE_INTERVAL_TIMER = 1, - BIOS_FREQ_BASE_REALTIME_CLOCK = 2 + BIOS_STATUS_SUCCESS = 0, + BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, + BIOS_STATUS_EINVAL = -EINVAL, + BIOS_STATUS_UNAVAIL = -EBUSY }; -# define BIOS_CALL(result, a0, a1, a2, a3, a4, a5, a6, a7) \ - do { \ - /* XXX - the real call goes here */ \ - result.status = BIOS_STATUS_UNIMPLEMENTED; \ - isrv.v0 = 0; \ - isrv.v1 = 0; \ - } while (0) +/* + * The UV system table describes specific firmware + * capabilities available to the Linux kernel at runtime. + */ +struct uv_systab { + char signature[4]; /* must be "UVST" */ + u32 revision; /* distinguish different firmware revs */ + u64 function; /* BIOS runtime callback function ptr */ +}; enum { - BIOS_STATUS_SUCCESS = 0, - BIOS_STATUS_UNIMPLEMENTED = -1, - BIOS_STATUS_EINVAL = -2, - BIOS_STATUS_ERROR = -3 + BIOS_FREQ_BASE_PLATFORM = 0, + BIOS_FREQ_BASE_INTERVAL_TIMER = 1, + BIOS_FREQ_BASE_REALTIME_CLOCK = 2 }; -struct uv_bios_retval { - /* - * A zero status value indicates call completed without error. - * A negative status value indicates reason of call failure. - * A positive status value indicates success but an - * informational value should be printed (e.g., "reboot for - * change to take effect"). - */ - s64 status; - u64 v0; - u64 v1; - u64 v2; -}; +/* + * bios calls have 6 parameters + */ +extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); +extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); +extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); + +extern void uv_bios_init(void); extern long x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, unsigned long *drift_info); -extern const char *x86_bios_strerror(long status); #endif /* ASM_X86__UV__BIOS_H */ -- cgit v1.2.3-70-g09d2 From 922402f15a85f7a064926eb1db68cc52bc4d4a91 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 3 Oct 2008 11:59:33 -0500 Subject: x86: Add UV partition call v4 Add a bios call to return partitioning related info. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/bios_uv.c | 44 +++++++++++++++++++++++++++++++++++----- arch/x86/kernel/genx2apic_uv_x.c | 14 +++++++------ include/asm-x86/uv/bios.h | 22 +++++++++++++++++--- 3 files changed, 66 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 5481eb59f78..f0dfe6f17e7 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -23,6 +23,7 @@ #include #include #include +#include struct uv_systab uv_systab; @@ -65,14 +66,47 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, return ret; } -long -x86_bios_freq_base(unsigned long clock_type, unsigned long *ticks_per_second, - unsigned long *drift_info) + +long sn_partition_id; +EXPORT_SYMBOL_GPL(sn_partition_id); +long uv_coherency_id; +EXPORT_SYMBOL_GPL(uv_coherency_id); +long uv_region_size; +EXPORT_SYMBOL_GPL(uv_region_size); +int uv_type; + + +s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, + long *region) +{ + s64 ret; + u64 v0, v1; + union partition_info_u part; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc, + (u64)(&v0), (u64)(&v1), 0, 0); + if (ret != BIOS_STATUS_SUCCESS) + return ret; + + part.val = v0; + if (uvtype) + *uvtype = part.hub_version; + if (partid) + *partid = part.partition_id; + if (coher) + *coher = part.coherence_id; + if (region) + *region = part.region_size; + return ret; +} + + +s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, - (u64)ticks_per_second, 0, 0, 0); + (u64)ticks_per_second, 0, 0, 0); } -EXPORT_SYMBOL_GPL(x86_bios_freq_base); +EXPORT_SYMBOL_GPL(uv_bios_freq_base); #ifdef CONFIG_EFI diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index aa2e5e15bf6..bfd532843df 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -341,12 +341,12 @@ static __init void map_mmioh_high(int max_pnode) static __init void uv_rtc_init(void) { - long status, ticks_per_sec, drift; + long status; + u64 ticks_per_sec; - status = - x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, - &drift); - if (status != 0 || ticks_per_sec < 100000) { + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, + &ticks_per_sec); + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { printk(KERN_WARNING "unable to determine platform RTC clock frequency, " "guessing.\n"); @@ -428,6 +428,8 @@ void __init uv_system_init(void) ~((1 << n_val) - 1)) << m_val; uv_bios_init(); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, + &uv_coherency_id, &uv_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@ -449,7 +451,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ + uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); diff --git a/include/asm-x86/uv/bios.h b/include/asm-x86/uv/bios.h index f63e46e5337..3b305d897c8 100644 --- a/include/asm-x86/uv/bios.h +++ b/include/asm-x86/uv/bios.h @@ -61,6 +61,16 @@ enum { BIOS_FREQ_BASE_REALTIME_CLOCK = 2 }; +union partition_info_u { + u64 val; + struct { + u64 hub_version : 8, + partition_id : 16, + coherence_id : 16, + region_size : 24; + }; +}; + /* * bios calls have 6 parameters */ @@ -68,10 +78,16 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); +extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); +extern s64 uv_bios_freq_base(u64, u64 *); + extern void uv_bios_init(void); -extern long -x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, - unsigned long *drift_info); +extern int uv_type; +extern long sn_partition_id; +extern long uv_coherency_id; +extern long uv_region_size; +#define partition_coherence_id() (uv_coherency_id) + #endif /* ASM_X86__UV__BIOS_H */ -- cgit v1.2.3-70-g09d2 From fc8c2d763bacc02962048fa042e287debb1416aa Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 3 Oct 2008 11:59:50 -0500 Subject: x86: Add sysfs entries for UV v4 Create /sys/firmware/sgi_uv sysfs entries for partition_id and coherence_id. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/uv_sysfs.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/uv/bios.h | 1 + 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/uv_sysfs.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index acc0e8bf043..cb6ade443f0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -108,7 +108,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o - obj-y += bios_uv.o uv_irq.o + obj-y += bios_uv.o uv_irq.o uv_sysfs.o obj-y += genx2apic_cluster.o obj-y += genx2apic_phys.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c new file mode 100644 index 00000000000..67f9b9dbf80 --- /dev/null +++ b/arch/x86/kernel/uv_sysfs.c @@ -0,0 +1,72 @@ +/* + * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson + */ + +#include +#include + +struct kobject *sgi_uv_kobj; + +static ssize_t partition_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); +} + +static ssize_t coherence_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); +} + +static struct kobj_attribute partition_id_attr = + __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); + +static struct kobj_attribute coherence_id_attr = + __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); + + +static int __init sgi_uv_sysfs_init(void) +{ + unsigned long ret; + + if (!sgi_uv_kobj) + sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); + if (!sgi_uv_kobj) { + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + return -EINVAL; + } + + ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + return ret; + } + + ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + return ret; + } + + return 0; +} + +device_initcall(sgi_uv_sysfs_init); diff --git a/include/asm-x86/uv/bios.h b/include/asm-x86/uv/bios.h index 3b305d897c8..215f1969c26 100644 --- a/include/asm-x86/uv/bios.h +++ b/include/asm-x86/uv/bios.h @@ -89,5 +89,6 @@ extern long uv_coherency_id; extern long uv_region_size; #define partition_coherence_id() (uv_coherency_id) +extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ #endif /* ASM_X86__UV__BIOS_H */ -- cgit v1.2.3-70-g09d2 From 4c66a73f0796dacc2ff0d4af75794ec843ceb3d1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 4 Oct 2008 15:52:15 -0700 Subject: x86: sparse_irq: fix typo in debug print out Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 260c95a5e6d..f959acbc0db 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -257,7 +257,7 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) panic("please boot with nr_irq_cfg= %d\n", count * 2); phys = __pa(cfg); - printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + printk(KERN_DEBUG "irq_cfg ==> [%#lx - %#lx]\n", phys, phys + total_bytes); for (i = 0; i < nr_irq_cfg; i++) init_one_irq_cfg(&cfg[i]); -- cgit v1.2.3-70-g09d2 From 81608f3c254512b906ab78082ec5966b376aacd5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 10 Oct 2008 19:00:17 +0400 Subject: x86: apic - unify APIC_DIVISOR Use APIC_DIVISOR being set to 16 for both 32/64bit mode. To escape APIC timer underflow during calibration set it to the maximum possible value. Also typo error (CONFG instead of proper CONFIG) fixed. The error was caught by Venkatesh Pallipadi, thanks a lot Venkatesh! See details on http://lkml.org/lkml/2008/10/9/425 Reported-by: Venkatesh Pallipad Signed-off-by: Cyrill Gorcunov Acked-by: "Maciej W. Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 1f48bd1c9f2..04a7f960bbc 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -332,11 +332,7 @@ int lapic_get_maxlvt(void) */ /* Clock divisor */ -#ifdef CONFG_X86_64 -#define APIC_DIVISOR 1 -#else #define APIC_DIVISOR 16 -#endif /* * This function sets up the local APIC timer, with a timeout of @@ -592,10 +588,10 @@ static int __init calibrate_APIC_clock(void) global_clock_event->event_handler = lapic_cal_handler; /* - * Setup the APIC counter to 1e9. There is no way the lapic + * Setup the APIC counter to maximum. There is no way the lapic * can underflow in the 100ms detection time frame */ - __setup_APIC_LVTT(1000000000, 0, 0); + __setup_APIC_LVTT(0xffffffff, 0, 0); /* Let the interrupts run */ local_irq_enable(); -- cgit v1.2.3-70-g09d2 From 3235e936c0cc3589309280b6f59e5096779adae3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 13:16:00 +0200 Subject: x86: remove sparse irq from Kconfig This code is not ready yet. Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 600584b7a49..8636ddf2f4a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -237,17 +237,6 @@ config SMP If you don't know what to do here, say N. -config HAVE_SPARSE_IRQ - bool "Support sparse irq numbering" - depends on PCI_MSI || HT_IRQ - default y - help - This enables support for sparse irq, esp for msi/msi-x. the irq - number will be bus/dev/fn + 12bit. You may need if you have lots of - cards supports msi-x installed. - - If you don't know what to do here, say Y. - config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER -- cgit v1.2.3-70-g09d2 From 2cc21ef843d4fb7da122239b644a1f6f0aca60a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:16:55 +0200 Subject: genirq: remove sparse irq code This code is not ready, but we need to rip it out instead of rebasing as we would lose the APIC/IO_APIC unification otherwise. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_apic.c | 130 ++----------------------------------------- arch/x86/kernel/irq_32.c | 8 --- arch/x86/kernel/irq_64.c | 8 --- drivers/char/random.c | 31 ----------- drivers/pci/htirq.c | 19 +------ drivers/pci/intr_remapping.c | 75 ------------------------- fs/proc/proc_misc.c | 43 ++------------ include/linux/irq.h | 20 ------- kernel/irq/handle.c | 114 ------------------------------------- 9 files changed, 10 insertions(+), 438 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index f959acbc0db..683610517d2 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -111,9 +111,6 @@ struct irq_cfg; struct irq_pin_list; struct irq_cfg { unsigned int irq; -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_cfg *next; -#endif struct irq_pin_list *irq_2_pin; cpumask_t domain; cpumask_t old_domain; @@ -151,15 +148,6 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) static struct irq_cfg *irq_cfgx; -#ifdef CONFIG_HAVE_SPARSE_IRQ -/* - * Protect the irq_cfgx_free freelist: - */ -static DEFINE_SPINLOCK(irq_cfg_lock); - -static struct irq_cfg *irq_cfgx_free; -#endif - static void __init init_work(void *data) { struct dyn_array *da = data; @@ -174,114 +162,7 @@ static void __init init_work(void *data) legacy_count = ARRAY_SIZE(irq_cfg_legacy); for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); - -#ifdef CONFIG_HAVE_SPARSE_IRQ - for (i = 1; i < *da->nr; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = &irq_cfgx[legacy_count]; - irq_cfgx[legacy_count - 1].next = NULL; -#endif -} - -#ifdef CONFIG_HAVE_SPARSE_IRQ -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); - -#define for_each_irq_cfg(irqX, cfg) \ - for (cfg = irq_cfgx, irqX = cfg->irq; cfg; cfg = cfg->next, irqX = cfg ? cfg->irq : -1U) - - -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); - -static struct irq_cfg *irq_cfg(unsigned int irq) -{ - struct irq_cfg *cfg; - - cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg = cfg->next; - } - - return NULL; -} - -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) -{ - struct irq_cfg *cfg, *cfg_pri; - unsigned long flags; - int count = 0; - int i; - - cfg_pri = cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg_pri = cfg; - cfg = cfg->next; - count++; - } - - spin_lock_irqsave(&irq_cfg_lock, flags); - if (!irq_cfgx_free) { - unsigned long phys; - unsigned long total_bytes; - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); - - total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; - if (after_bootmem) - cfg = kzalloc(total_bytes, GFP_ATOMIC); - else - cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!cfg) - panic("please boot with nr_irq_cfg= %d\n", count * 2); - - phys = __pa(cfg); - printk(KERN_DEBUG "irq_cfg ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_cfg; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < nr_irq_cfg; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = cfg; - } - - cfg = irq_cfgx_free; - irq_cfgx_free = irq_cfgx_free->next; - cfg->next = NULL; - if (cfg_pri) - cfg_pri->next = cfg; - else - irq_cfgx = cfg; - cfg->irq = irq; - - spin_unlock_irqrestore(&irq_cfg_lock, flags); - - return cfg; } -#else #define for_each_irq_cfg(irq, cfg) \ for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq]) @@ -290,17 +171,16 @@ DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work struct irq_cfg *irq_cfg(unsigned int irq) { - if (irq < nr_irqs) - return &irq_cfgx[irq]; + if (irq < nr_irqs) + return &irq_cfgx[irq]; - return NULL; + return NULL; } struct irq_cfg *irq_cfg_alloc(unsigned int irq) { - return irq_cfg(irq); + return irq_cfg(irq); } -#endif /* * This is performance-critical, we want to do it O(1) * @@ -3068,9 +2948,7 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned long flags; struct irq_cfg *cfg_new; -#ifndef CONFIG_HAVE_SPARSE_IRQ irq_want = nr_irqs - 1; -#endif irq = 0; spin_lock_irqsave(&vector_lock, flags); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 001772ffc91..ccf6c1bf712 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -272,20 +272,12 @@ int show_interrupts(struct seq_file *p, void *v) struct irq_desc *desc = NULL; int tail = 0; -#ifdef CONFIG_HAVE_SPARSE_IRQ - desc = (struct irq_desc *)v; - entries = -1U; - i = desc->irq; - if (!desc->next) - tail = 1; -#else entries = nr_irqs - 1; i = *(loff_t *) v; if (i == nr_irqs) tail = 1; else desc = irq_to_desc(i); -#endif if (i == 0) { seq_printf(p, " "); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index ec266109128..21f53b91111 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -77,20 +77,12 @@ int show_interrupts(struct seq_file *p, void *v) struct irq_desc *desc = NULL; int tail = 0; -#ifdef CONFIG_HAVE_SPARSE_IRQ - desc = (struct irq_desc *)v; - entries = -1U; - i = desc->irq; - if (!desc->next) - tail = 1; -#else entries = nr_irqs - 1; i = *(loff_t *) v; if (i == nr_irqs) tail = 1; else desc = irq_to_desc(i); -#endif if (i == 0) { seq_printf(p, " "); diff --git a/drivers/char/random.c b/drivers/char/random.c index 60c9c7ee6b2..9ce80213007 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -558,8 +558,6 @@ struct timer_rand_state { unsigned dont_count_entropy:1; }; -#ifndef CONFIG_HAVE_SPARSE_IRQ - #ifdef CONFIG_HAVE_DYN_ARRAY static struct timer_rand_state **irq_timer_state; DEFINE_DYN_ARRAY(irq_timer_state, sizeof(struct timer_rand_state *), nr_irqs, PAGE_SIZE, NULL); @@ -583,33 +581,6 @@ static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *stat irq_timer_state[irq] = state; } -#else - -static struct timer_rand_state *get_timer_rand_state(unsigned int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - if (!desc) - return NULL; - - return desc->timer_rand_state; -} - -static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - if (!desc) - return; - - desc->timer_rand_state = state; -} -#endif - static struct timer_rand_state input_timer_state; /* @@ -967,10 +938,8 @@ void rand_initialize_irq(int irq) { struct timer_rand_state *state; -#ifndef CONFIG_HAVE_SPARSE_IRQ if (irq >= nr_irqs) return; -#endif state = get_timer_rand_state(irq); diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index 9e4929a0083..bf7d6ce9bbb 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -82,18 +82,6 @@ void unmask_ht_irq(unsigned int irq) write_ht_irq_msg(irq, &msg); } -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - /** * __ht_create_irq - create an irq and attach it to a device. * @dev: The hypertransport device to find the irq capability on. @@ -110,7 +98,6 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) int max_irq; int pos; int irq; - unsigned int irq_want; pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ); if (!pos) @@ -138,12 +125,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) cfg->msg.address_lo = 0xffffffff; cfg->msg.address_hi = 0xffffffff; - irq_want= build_irq_for_pci_dev(dev); -#ifdef CONFIG_HAVE_SPARSE_IRQ - irq = create_irq_nr(irq_want + idx); -#else irq = create_irq(); -#endif + if (irq <= 0) { kfree(cfg); return -EBUSY; diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 2dcf973890c..0f43b265eee 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -19,78 +19,6 @@ struct irq_2_iommu { u8 irte_mask; }; -#ifdef CONFIG_HAVE_SPARSE_IRQ -static struct irq_2_iommu *irq_2_iommuX; -/* fill one page ? */ -static int nr_irq_2_iommu = 0x100; -static int irq_2_iommu_index; -DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irq_2_iommu, PAGE_SIZE, NULL); - -extern void *__alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal); - -static struct irq_2_iommu *get_one_free_irq_2_iommu(int not_used) -{ - struct irq_2_iommu *iommu; - unsigned long total_bytes; - - if (irq_2_iommu_index >= nr_irq_2_iommu) { - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_2_iommu %d\n", nr_irq_2_iommu); - - total_bytes = sizeof(struct irq_2_iommu)*nr_irq_2_iommu; - - if (after_bootmem) - iommu = kzalloc(total_bytes, GFP_ATOMIC); - else - iommu = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!iommu) - panic("can not get more irq_2_iommu\n"); - - irq_2_iommuX = iommu; - irq_2_iommu_index = 0; - } - - iommu = &irq_2_iommuX[irq_2_iommu_index]; - irq_2_iommu_index++; - return iommu; -} - -static struct irq_2_iommu *irq_2_iommu(unsigned int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - BUG_ON(!desc); - - return desc->irq_2_iommu; -} - -static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) -{ - struct irq_desc *desc; - struct irq_2_iommu *irq_iommu; - - /* - * alloc irq desc if not allocated already. - */ - desc = irq_to_desc_alloc(irq); - - irq_iommu = desc->irq_2_iommu; - - if (!irq_iommu) - desc->irq_2_iommu = get_one_free_irq_2_iommu(irq); - - return desc->irq_2_iommu; -} - -#else /* !CONFIG_HAVE_SPARSE_IRQ */ - #ifdef CONFIG_HAVE_DYN_ARRAY static struct irq_2_iommu *irq_2_iommuX; DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irqs, PAGE_SIZE, NULL); @@ -109,7 +37,6 @@ static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) { return irq_2_iommu(irq); } -#endif static DEFINE_SPINLOCK(irq_2_ir_lock); @@ -166,11 +93,9 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) if (!count) return -1; -#ifndef CONFIG_HAVE_SPARSE_IRQ /* protect irq_2_iommu_alloc later */ if (irq >= nr_irqs) return -1; -#endif /* * start the IRTE search from index 0. diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index d68c3592fe4..3f5c7b9d1a7 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -529,13 +529,10 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + for_each_irq_desc(j, desc) - { - unsigned int temp; + sum += kstat_irqs_cpu(j, i); - temp = kstat_irqs_cpu(j, i); - sum += temp; - } sum += arch_irq_stat_cpu(i); } sum += arch_irq_stat(); @@ -578,21 +575,13 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "intr %llu", (unsigned long long)sum); /* sum again ? it could be updated? */ - for_each_irq_desc(j, desc) - { + for_each_irq_desc(j, desc) { per_irq_sum = 0; - for_each_possible_cpu(i) { - unsigned int temp; - temp = kstat_irqs_cpu(j, i); - per_irq_sum += temp; - } + for_each_possible_cpu(i) + per_irq_sum += kstat_irqs_cpu(j, i); -#ifdef CONFIG_HAVE_SPARSE_IRQ - seq_printf(p, " %#x:%u", j, per_irq_sum); -#else seq_printf(p, " %u", per_irq_sum); -#endif } seq_printf(p, @@ -645,36 +634,14 @@ static const struct file_operations proc_stat_operations = { */ static void *int_seq_start(struct seq_file *f, loff_t *pos) { -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_desc *desc; - int irq; - int count = *pos; - - for_each_irq_desc(irq, desc) { - if (count-- == 0) - return desc; - } - - return NULL; -#else return (*pos <= nr_irqs) ? pos : NULL; -#endif } static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) { -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_desc *desc; - - desc = ((struct irq_desc *)v)->next; - (*pos)++; - - return desc; -#else (*pos)++; return (*pos <= nr_irqs) ? pos : NULL; -#endif } static void int_seq_stop(struct seq_file *f, void *v) diff --git a/include/linux/irq.h b/include/linux/irq.h index 7d1adacaadb..68e0f3f9df3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -167,15 +167,8 @@ struct irq_2_iommu; */ struct irq_desc { unsigned int irq; -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_desc *next; - struct timer_rand_state *timer_rand_state; -#endif #ifdef CONFIG_HAVE_DYN_ARRAY unsigned int *kstat_irqs; -#endif -#if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ) - struct irq_2_iommu *irq_2_iommu; #endif irq_flow_handler_t handle_irq; struct irq_chip *chip; @@ -205,8 +198,6 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; -#ifndef CONFIG_HAVE_SPARSE_IRQ - #ifndef CONFIG_HAVE_DYN_ARRAY /* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; @@ -224,17 +215,6 @@ static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq) return irq_to_desc(irq); } -#else - -extern struct irq_desc *irq_to_desc(unsigned int irq); -extern struct irq_desc *irq_to_desc_alloc(unsigned int irq); - -extern struct irq_desc *sparse_irqs; -#define for_each_irq_desc(irqX, desc) \ - for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U) - -#endif - #ifdef CONFIG_HAVE_DYN_ARRAY #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c19896f895f..f837133cdfb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -111,15 +111,6 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) } } -#ifdef CONFIG_HAVE_SPARSE_IRQ -/* - * Protect the sparse_irqs_free freelist: - */ -static DEFINE_SPINLOCK(sparse_irq_lock); -static struct irq_desc *sparse_irqs_free; -struct irq_desc *sparse_irqs; -#endif - static void __init init_work(void *data) { struct dyn_array *da = data; @@ -130,121 +121,16 @@ static void __init init_work(void *data) for (i = 0; i < *da->nr; i++) { init_one_irq_desc(&desc[i]); -#ifndef CONFIG_HAVE_SPARSE_IRQ desc[i].irq = i; -#endif } /* init kstat_irqs, nr_cpu_ids is ready already */ init_kstat_irqs(desc, *da->nr, nr_cpu_ids); - -#ifdef CONFIG_HAVE_SPARSE_IRQ - for (i = 1; i < *da->nr; i++) - desc[i-1].next = &desc[i]; - - sparse_irqs_free = sparse_irqs; - sparse_irqs = NULL; -#endif -} - -#ifdef CONFIG_HAVE_SPARSE_IRQ -static int nr_irq_desc = 32; - -static int __init parse_nr_irq_desc(char *arg) -{ - if (arg) - nr_irq_desc = simple_strtoul(arg, NULL, 0); - return 0; -} - -early_param("nr_irq_desc", parse_nr_irq_desc); - -DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - struct irq_desc *desc; - - desc = sparse_irqs; - while (desc) { - if (desc->irq == irq) - return desc; - - desc = desc->next; - } - return NULL; } -struct irq_desc *irq_to_desc_alloc(unsigned int irq) -{ - struct irq_desc *desc, *desc_pri; - unsigned long flags; - int count = 0; - int i; - - desc_pri = desc = sparse_irqs; - while (desc) { - if (desc->irq == irq) - return desc; - - desc_pri = desc; - desc = desc->next; - count++; - } - - spin_lock_irqsave(&sparse_irq_lock, flags); - /* - * we run out of pre-allocate ones, allocate more - */ - if (!sparse_irqs_free) { - unsigned long phys; - unsigned long total_bytes; - - printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); - - total_bytes = sizeof(struct irq_desc) * nr_irq_desc; - if (after_bootmem) - desc = kzalloc(total_bytes, GFP_ATOMIC); - else - desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!desc) - panic("please boot with nr_irq_desc= %d\n", count * 2); - - phys = __pa(desc); - printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_desc; i++) - init_one_irq_desc(&desc[i]); - - for (i = 1; i < nr_irq_desc; i++) - desc[i-1].next = &desc[i]; - - /* init kstat_irqs, nr_cpu_ids is ready already */ - init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids); - - sparse_irqs_free = desc; - } - - desc = sparse_irqs_free; - sparse_irqs_free = sparse_irqs_free->next; - desc->next = NULL; - if (desc_pri) - desc_pri->next = desc; - else - sparse_irqs = desc; - desc->irq = irq; - - spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} -#else struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); -#endif - #else struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { -- cgit v1.2.3-70-g09d2 From ee32c9732244bde4b9b59eeac2814c23e2b71f8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:34:09 +0200 Subject: genirq: remove irq_to_desc_alloc Remove the leftover of sparseirqs. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_apic.c | 6 +----- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 2 +- include/linux/irq.h | 5 ----- kernel/irq/chip.c | 2 +- 5 files changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 683610517d2..e03bc0f87ee 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1257,11 +1257,7 @@ static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; - /* first time to use this irq_desc */ - if (irq < 16) - desc = irq_to_desc(irq); - else - desc = irq_to_desc_alloc(irq); + desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 9092103a18e..a8d35998d30 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -70,7 +70,7 @@ void __init init_ISA_irqs (void) */ for (i = 0; i < 16; i++) { /* first time call this irq_desc */ - struct irq_desc *desc = irq_to_desc_alloc(i); + struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; desc->action = NULL; diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index d17fbc26d96..ff023539128 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -144,7 +144,7 @@ void __init init_ISA_irqs(void) for (i = 0; i < 16; i++) { /* first time call this irq_desc */ - struct irq_desc *desc = irq_to_desc_alloc(i); + struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; desc->action = NULL; diff --git a/include/linux/irq.h b/include/linux/irq.h index 68e0f3f9df3..3f33c779030 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -210,11 +210,6 @@ static inline struct irq_desc *irq_to_desc(unsigned int irq) return (irq < nr_irqs) ? irq_desc + irq : NULL; } -static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq) -{ - return irq_to_desc(irq); -} - #ifdef CONFIG_HAVE_DYN_ARRAY #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 570d1ea1db5..e6f73dbfcc3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -28,7 +28,7 @@ void dynamic_irq_init(unsigned int irq) unsigned long flags; /* first time to use this irq_desc */ - desc = irq_to_desc_alloc(irq); + desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; -- cgit v1.2.3-70-g09d2 From d6c88a507ef0b6afdb013cba4e7804ba7324d99a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 15:27:23 +0200 Subject: genirq: revert dynarray Revert the dynarray changes. They need more thought and polishing. Signed-off-by: Thomas Gleixner --- arch/Kconfig | 4 - arch/x86/Kconfig | 1 - arch/x86/kernel/io_apic.c | 199 ++++++++++++++------------------------ arch/x86/kernel/setup_percpu.c | 8 +- arch/x86/kernel/visws_quirks.c | 2 +- arch/x86/kernel/vmlinux_32.lds.S | 1 - arch/x86/kernel/vmlinux_64.lds.S | 2 - arch/x86/xen/spinlock.c | 2 +- drivers/char/random.c | 5 - drivers/pci/intr_remapping.c | 11 +-- include/asm-generic/vmlinux.lds.h | 13 --- include/linux/init.h | 43 -------- include/linux/irq.h | 15 --- include/linux/kernel_stat.h | 16 ++- init/Makefile | 2 +- init/dyn_array.c | 120 ----------------------- init/main.c | 11 +-- kernel/irq/chip.c | 30 +----- kernel/irq/handle.c | 114 ++-------------------- 19 files changed, 103 insertions(+), 496 deletions(-) delete mode 100644 init/dyn_array.c (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index c8a7c2eb649..071004d3a1b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -102,7 +102,3 @@ config HAVE_CLK help The calls support software clock gating and thus are a key power management tool on many systems. - -config HAVE_DYN_ARRAY - def_bool n - diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8636ddf2f4a..8da6123a60d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,7 +33,6 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_DYN_ARRAY config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index e03bc0f87ee..6f80dc2f137 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -107,7 +107,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -struct irq_cfg; struct irq_pin_list; struct irq_cfg { unsigned int irq; @@ -120,7 +119,7 @@ struct irq_cfg { }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg_legacy[] __initdata = { +static struct irq_cfg irq_cfgx[NR_IRQS] = { [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, @@ -139,48 +138,26 @@ static struct irq_cfg irq_cfg_legacy[] __initdata = { [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; -static struct irq_cfg irq_cfg_init = { .irq = -1U, }; - -static void init_one_irq_cfg(struct irq_cfg *cfg) -{ - memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); -} - -static struct irq_cfg *irq_cfgx; - -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_cfg *cfg; - int legacy_count; - int i; - - cfg = *da->name; - - memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - - legacy_count = ARRAY_SIZE(irq_cfg_legacy); - for (i = legacy_count; i < *da->nr; i++) - init_one_irq_cfg(&cfg[i]); -} - #define for_each_irq_cfg(irq, cfg) \ - for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq]) + for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); - -struct irq_cfg *irq_cfg(unsigned int irq) +static struct irq_cfg *irq_cfg(unsigned int irq) { - if (irq < nr_irqs) - return &irq_cfgx[irq]; - - return NULL; + return irq < nr_irqs ? irq_cfgx + irq : NULL; } -struct irq_cfg *irq_cfg_alloc(unsigned int irq) + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) { return irq_cfg(irq); } +/* + * Rough estimation of how many shared IRQs there are, can be changed + * anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + /* * This is performance-critical, we want to do it O(1) * @@ -193,59 +170,29 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *irq_2_pin_head; -/* fill one page ? */ -static int nr_irq_2_pin = 0x100; +static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; static struct irq_pin_list *irq_2_pin_ptr; -static void __init irq_2_pin_init_work(void *data) + +static void __init irq_2_pin_init(void) { - struct dyn_array *da = data; - struct irq_pin_list *pin; + struct irq_pin_list *pin = irq_2_pin_head; int i; - pin = *da->name; - - for (i = 1; i < *da->nr; i++) + for (i = 1; i < PIN_MAP_SIZE; i++) pin[i-1].next = &pin[i]; irq_2_pin_ptr = &pin[0]; } -DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); static struct irq_pin_list *get_one_free_irq_2_pin(void) { - struct irq_pin_list *pin; - int i; - - pin = irq_2_pin_ptr; - - if (pin) { - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; - } - - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); - - if (after_bootmem) - pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, - GFP_ATOMIC); - else - pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * - nr_irq_2_pin, PAGE_SIZE, 0); + struct irq_pin_list *pin = irq_2_pin_ptr; if (!pin) panic("can not get more irq_2_pin\n"); - for (i = 1; i < nr_irq_2_pin; i++) - pin[i-1].next = &pin[i]; - irq_2_pin_ptr = pin->next; pin->next = NULL; - return pin; } @@ -284,8 +231,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); + + if (sis_apic_bug) + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -1044,11 +992,11 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; - /* + /* * For MPS mode, so far only needed by ES7000 platform */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } #ifdef CONFIG_X86_32 @@ -1232,19 +1180,19 @@ static struct irq_chip ir_ioapic_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { - int apic, idx, pin; + int apic, idx, pin; - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* * nonexistent IRQs are edge default */ - return 0; + return 0; } #else static inline int IO_APIC_irq_trigger(int irq) @@ -1509,8 +1457,8 @@ __apicdebuginit(void) print_IO_APIC(void) reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); @@ -2089,9 +2037,9 @@ static int ioapic_retrigger_irq(unsigned int irq) #else static int ioapic_retrigger_irq(unsigned int irq) { - send_IPI_self(irq_cfg(irq)->vector); + send_IPI_self(irq_cfg(irq)->vector); - return 1; + return 1; } #endif @@ -2189,7 +2137,7 @@ static int migrate_irq_remapped_level(int irq) if (io_apic_level_ack_pending(irq)) { /* - * Interrupt in progress. Migrating irq now will change the + * Interrupt in progress. Migrating irq now will change the * vector information in the IO-APIC RTE and that will confuse * the EOI broadcast performed by cpu. * So, delay the irq migration to the next instance. @@ -2426,28 +2374,28 @@ static void ack_apic_level(unsigned int irq) } static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .set_affinity = set_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; #ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_x2apic_edge, - .eoi = ack_x2apic_level, + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, + .set_affinity = set_ir_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; @@ -2636,8 +2584,8 @@ static inline void __init check_timer(void) local_irq_save(flags); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); /* * get/set the timer IRQ vector: @@ -2822,12 +2770,12 @@ void __init setup_IO_APIC(void) io_apic_irqs = ~PIC_IRQS; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* + /* * Set up IO-APIC IRQ routing. */ #ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); #endif sync_Arb_IDs(); setup_IO_APIC_irqs(); @@ -2842,9 +2790,9 @@ void __init setup_IO_APIC(void) static int __init io_apic_bug_finalize(void) { - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; } late_initcall(io_apic_bug_finalize); @@ -3199,7 +3147,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) if (index < 0) { printk(KERN_ERR "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); + pci_name(dev)); return -ENOSPC; } return index; @@ -3885,23 +3833,24 @@ static struct resource * __init ioapic_setup_resources(void) void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; struct resource *ioapic_res; + int i; + irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; #ifdef CONFIG_X86_32 - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } #endif } else { #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2b7dab699e8..410c88f0bfe 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -140,7 +140,7 @@ static void __init setup_cpu_pda_map(void) */ void __init setup_per_cpu_areas(void) { - ssize_t size, old_size, da_size; + ssize_t size, old_size; char *ptr; int cpu; unsigned long align = 1; @@ -150,9 +150,8 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ old_size = PERCPU_ENOUGH_ROOM; - da_size = per_cpu_dyn_array_size(&align); align = max_t(unsigned long, PAGE_SIZE, align); - size = roundup(old_size + da_size, align); + size = roundup(old_size, align); printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); @@ -182,9 +181,6 @@ void __init setup_per_cpu_areas(void) #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - - per_cpu_alloc_dyn_array(cpu, ptr + old_size); - } printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 817aa55a120..0c9667f0752 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -633,7 +633,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) /* * handle this 'virtual interrupt' as a Cobalt one now. */ - kstat_irqs_this_cpu(desc)++; + kstat_incr_irqs_this_cpu(realirq, desc); if (likely(desc->action != NULL)) handle_IRQ_event(realirq, desc->action); diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index c36007ab394..a9b8560adbc 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -145,7 +145,6 @@ SECTIONS *(.x86_cpu_dev.init) __x86_cpu_dev_end = .; } - DYN_ARRAY_INIT(8) SECURITY_INIT . = ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 30973dbac8c..3245ad72594 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -174,8 +174,6 @@ SECTIONS } __x86_cpu_dev_end = .; - DYN_ARRAY_INIT(8) - SECURITY_INIT . = ALIGN(8); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index bb6bc721b13..5601506f2dd 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - kstat_irqs_this_cpu(irq_to_desc(irq))++; + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); out: raw_local_irq_restore(flags); diff --git a/drivers/char/random.c b/drivers/char/random.c index 9ce80213007..1137d297604 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -558,12 +558,7 @@ struct timer_rand_state { unsigned dont_count_entropy:1; }; -#ifdef CONFIG_HAVE_DYN_ARRAY -static struct timer_rand_state **irq_timer_state; -DEFINE_DYN_ARRAY(irq_timer_state, sizeof(struct timer_rand_state *), nr_irqs, PAGE_SIZE, NULL); -#else static struct timer_rand_state *irq_timer_state[NR_IRQS]; -#endif static struct timer_rand_state *get_timer_rand_state(unsigned int irq) { diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 0f43b265eee..950769e8747 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -19,20 +19,13 @@ struct irq_2_iommu { u8 irte_mask; }; -#ifdef CONFIG_HAVE_DYN_ARRAY -static struct irq_2_iommu *irq_2_iommuX; -DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irqs, PAGE_SIZE, NULL); -#else static struct irq_2_iommu irq_2_iommuX[NR_IRQS]; -#endif static struct irq_2_iommu *irq_2_iommu(unsigned int irq) { - if (irq < nr_irqs) - return &irq_2_iommuX[irq]; - - return NULL; + return (irq < nr_irqs) ?: irq_2_iommuX + irq : NULL; } + static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) { return irq_2_iommu(irq); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c68eda9d9a9..7440a0dcedd 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -210,19 +210,6 @@ * All archs are supposed to use RO_DATA() */ #define RODATA RO_DATA(4096) -#define DYN_ARRAY_INIT(align) \ - . = ALIGN((align)); \ - .dyn_array.init : AT(ADDR(.dyn_array.init) - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__dyn_array_start) = .; \ - *(.dyn_array.init) \ - VMLINUX_SYMBOL(__dyn_array_end) = .; \ - } \ - . = ALIGN((align)); \ - .per_cpu_dyn_array.init : AT(ADDR(.per_cpu_dyn_array.init) - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__per_cpu_dyn_array_start) = .; \ - *(.per_cpu_dyn_array.init) \ - VMLINUX_SYMBOL(__per_cpu_dyn_array_end) = .; \ - } #define SECURITY_INIT \ .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__security_initcall_start) = .; \ diff --git a/include/linux/init.h b/include/linux/init.h index 59fbb4aaba6..70ad53e1eab 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -247,49 +247,6 @@ struct obs_kernel_param { /* Relies on boot_command_line being set */ void __init parse_early_param(void); -struct dyn_array { - void **name; - unsigned long size; - unsigned int *nr; - unsigned long align; - void (*init_work)(void *); -}; -extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[]; -extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[]; - -#define DEFINE_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ - static struct dyn_array __dyn_array_##nameX __initdata = \ - { .name = (void **)&(nameX),\ - .size = sizeX,\ - .nr = &(nrX),\ - .align = alignX,\ - .init_work = init_workX,\ - }; \ - static struct dyn_array *__dyn_array_ptr_##nameX __used \ - __attribute__((__section__(".dyn_array.init"))) = \ - &__dyn_array_##nameX - -#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ - DEFINE_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX) - -#define DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ - static struct dyn_array __per_cpu_dyn_array_##nameX __initdata = \ - { .name = (void **)&(addrX),\ - .size = sizeX,\ - .nr = &(nrX),\ - .align = alignX,\ - .init_work = init_workX,\ - }; \ - static struct dyn_array *__per_cpu_dyn_array_ptr_##nameX __used \ - __attribute__((__section__(".per_cpu_dyn_array.init"))) = \ - &__per_cpu_dyn_array_##nameX - -#define DEFINE_PER_CPU_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ - DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX) - -extern void pre_alloc_dyn_array(void); -extern unsigned long per_cpu_dyn_array_size(unsigned long *align); -extern void per_cpu_alloc_dyn_array(int cpu, char *ptr); #endif /* __ASSEMBLY__ */ /** diff --git a/include/linux/irq.h b/include/linux/irq.h index 3f33c779030..38bf89f2ade 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -139,8 +139,6 @@ struct irq_chip { const char *typename; }; -struct timer_rand_state; -struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @@ -167,9 +165,6 @@ struct irq_2_iommu; */ struct irq_desc { unsigned int irq; -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned int *kstat_irqs; -#endif irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -198,23 +193,13 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; -#ifndef CONFIG_HAVE_DYN_ARRAY -/* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; -#else -extern struct irq_desc *irq_desc; -#endif static inline struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < nr_irqs) ? irq_desc + irq : NULL; } -#ifdef CONFIG_HAVE_DYN_ARRAY -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#endif - /* * Migration helpers for obsolete names, they will go away: */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 21249d8c129..a9d0d360b77 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,9 +28,7 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; -#ifndef CONFIG_HAVE_DYN_ARRAY unsigned int irqs[NR_IRQS]; -#endif }; DECLARE_PER_CPU(struct kernel_stat, kstat); @@ -41,20 +39,18 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); -#ifndef CONFIG_HAVE_DYN_ARRAY -#define kstat_irqs_this_cpu(irq) \ - (kstat_this_cpu.irqs[irq]) -#endif +struct irq_desc; +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, + struct irq_desc *desc) +{ + kstat_this_cpu.irqs[irq]++; +} -#ifndef CONFIG_HAVE_DYN_ARRAY static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).irqs[irq]; } -#else -extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#endif /* * Number of interrupts per specific IRQ source, since bootup diff --git a/init/Makefile b/init/Makefile index dc5eeca6eb6..4a243df426f 100644 --- a/init/Makefile +++ b/init/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -obj-y := main.o dyn_array.o version.o mounts.o +obj-y := main.o version.o mounts.o ifneq ($(CONFIG_BLK_DEV_INITRD),y) obj-y += noinitramfs.o else diff --git a/init/dyn_array.c b/init/dyn_array.c deleted file mode 100644 index c8d5e2a1858..00000000000 --- a/init/dyn_array.c +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include -#include -#include -#include - -void __init pre_alloc_dyn_array(void) -{ -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned long total_size = 0, size, phys; - unsigned long max_align = 1; - struct dyn_array **daa; - char *ptr; - - /* get the total size at first */ - for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) { - struct dyn_array *da = *daa; - - printk(KERN_DEBUG "dyn_array %pF size:%#lx nr:%d align:%#lx\n", - da->name, da->size, *da->nr, da->align); - size = da->size * (*da->nr); - total_size += roundup(size, da->align); - if (da->align > max_align) - max_align = da->align; - } - if (total_size) - printk(KERN_DEBUG "dyn_array total_size: %#lx\n", - total_size); - else - return; - - /* allocate them all together */ - max_align = max_t(unsigned long, max_align, PAGE_SIZE); - ptr = __alloc_bootmem(total_size, max_align, 0); - phys = virt_to_phys(ptr); - - for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) { - struct dyn_array *da = *daa; - - size = da->size * (*da->nr); - phys = roundup(phys, da->align); - printk(KERN_DEBUG "dyn_array %pF ==> [%#lx - %#lx]\n", - da->name, phys, phys + size); - *da->name = phys_to_virt(phys); - - phys += size; - - if (da->init_work) - da->init_work(da); - } -#else -#ifdef CONFIG_GENERIC_HARDIRQS - unsigned int i; - - for (i = 0; i < NR_IRQS; i++) - irq_desc[i].irq = i; -#endif -#endif -} - -unsigned long __init per_cpu_dyn_array_size(unsigned long *align) -{ - unsigned long total_size = 0; -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned long size; - struct dyn_array **daa; - unsigned max_align = 1; - - for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) { - struct dyn_array *da = *daa; - - printk(KERN_DEBUG "per_cpu_dyn_array %pF size:%#lx nr:%d align:%#lx\n", - da->name, da->size, *da->nr, da->align); - size = da->size * (*da->nr); - total_size += roundup(size, da->align); - if (da->align > max_align) - max_align = da->align; - } - if (total_size) { - printk(KERN_DEBUG "per_cpu_dyn_array total_size: %#lx\n", - total_size); - *align = max_align; - } -#endif - return total_size; -} - -#ifdef CONFIG_SMP -void __init per_cpu_alloc_dyn_array(int cpu, char *ptr) -{ -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned long size, phys; - struct dyn_array **daa; - unsigned long addr; - void **array; - - phys = virt_to_phys(ptr); - for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) { - struct dyn_array *da = *daa; - - size = da->size * (*da->nr); - phys = roundup(phys, da->align); - printk(KERN_DEBUG "per_cpu_dyn_array %pF ==> [%#lx - %#lx]\n", - da->name, phys, phys + size); - - addr = (unsigned long)da->name; - addr += per_cpu_offset(cpu); - array = (void **)addr; - *array = phys_to_virt(phys); - *da->name = *array; /* so init_work could use it directly */ - - phys += size; - - if (da->init_work) - da->init_work(da); - } -#endif -} -#endif diff --git a/init/main.c b/init/main.c index e81cf427d9c..27f6bf6108e 100644 --- a/init/main.c +++ b/init/main.c @@ -391,23 +391,17 @@ EXPORT_SYMBOL(__per_cpu_offset); static void __init setup_per_cpu_areas(void) { - unsigned long size, i, old_size; + unsigned long size, i; char *ptr; unsigned long nr_possible_cpus = num_possible_cpus(); - unsigned long align = 1; - unsigned da_size; /* Copy section for each CPU (we discard the original) */ - old_size = PERCPU_ENOUGH_ROOM; - da_size = per_cpu_dyn_array_size(&align); - align = max_t(unsigned long, PAGE_SIZE, align); - size = ALIGN(old_size + da_size, align); + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); ptr = alloc_bootmem_pages(size * nr_possible_cpus); for_each_possible_cpu(i) { __per_cpu_offset[i] = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - per_cpu_alloc_dyn_array(i, ptr + old_size); ptr += size; } } @@ -573,7 +567,6 @@ asmlinkage void __init start_kernel(void) printk(KERN_NOTICE); printk(linux_banner); setup_arch(&command_line); - pre_alloc_dyn_array(); mm_init_owner(&init_mm, &init_task); setup_command_line(command_line); unwind_setup(); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e6f73dbfcc3..d96d6f687c4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -326,11 +326,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -371,11 +367,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -422,11 +414,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -490,11 +478,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) mask_ack_irq(desc, irq); goto out_unlock; } -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->chip->ack(irq); @@ -549,11 +533,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); if (desc->chip->ack) desc->chip->ack(irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f837133cdfb..9fe86b3a60a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,11 +18,6 @@ #include "internals.h" -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; - /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -30,15 +25,10 @@ static struct lock_class_key irq_desc_lock_class; * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void -handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); ack_bad_irq(irq); } @@ -59,80 +49,6 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); -#ifdef CONFIG_HAVE_DYN_ARRAY -static struct irq_desc irq_desc_init = { - .irq = -1U, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif -}; - - -static void init_one_irq_desc(struct irq_desc *desc) -{ - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - lockdep_set_class(&desc->lock, &irq_desc_lock_class); -} - -extern int after_bootmem; -extern void *__alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal); - -static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) -{ - unsigned long bytes, total_bytes; - char *ptr; - int i; - unsigned long phys; - - /* Compute how many bytes we need per irq and allocate them */ - bytes = nr * sizeof(unsigned int); - total_bytes = bytes * nr_desc; - if (after_bootmem) - ptr = kzalloc(total_bytes, GFP_ATOMIC); - else - ptr = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!ptr) - panic(" can not allocate kstat_irqs\n"); - - phys = __pa(ptr); - printk(KERN_DEBUG "kstat_irqs ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_desc; i++) { - desc[i].kstat_irqs = (unsigned int *)ptr; - ptr += bytes; - } -} - -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - int i; - struct irq_desc *desc; - - desc = *da->name; - - for (i = 0; i < *da->nr; i++) { - init_one_irq_desc(&desc[i]); - desc[i].irq = i; - } - - /* init kstat_irqs, nr_cpu_ids is ready already */ - init_kstat_irqs(desc, *da->nr, nr_cpu_ids); -} - -struct irq_desc *irq_desc; -DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); - -#else - struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, @@ -146,8 +62,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -#endif - /* * What should we do if we get a hw irq event on an illegal vector? * Each architecture has to answer this themself. @@ -258,11 +172,8 @@ unsigned int __do_IRQ(unsigned int irq) struct irqaction *action; unsigned int status; -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); + if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; @@ -351,23 +262,16 @@ out: #ifdef CONFIG_TRACE_IRQFLAGS +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + void early_init_irq_lock_class(void) { -#ifndef CONFIG_HAVE_DYN_ARRAY int i; for (i = 0; i < nr_irqs; i++) lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); -#endif } #endif - -#ifdef CONFIG_HAVE_DYN_ARRAY -unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -{ - struct irq_desc *desc = irq_to_desc(irq); - return desc->kstat_irqs[cpu]; -} -#endif -EXPORT_SYMBOL(kstat_irqs_cpu); - -- cgit v1.2.3-70-g09d2 From a1aca5de08a0cb840a90fb3f729a5940f8d21185 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 15 Oct 2008 19:29:15 +0200 Subject: genirq: remove artifacts from sparseirq removal Signed-off-by: Ingo Molnar --- arch/Kconfig | 1 + arch/x86/kernel/acpi/boot.c | 1 - arch/x86/kernel/irqinit_32.c | 1 - arch/x86/kernel/vmlinux_64.lds.S | 1 - arch/x86/mm/init_32.c | 3 --- include/linux/init.h | 1 - include/linux/kernel_stat.h | 2 +- 7 files changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 071004d3a1b..0267babe5eb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -102,3 +102,4 @@ config HAVE_CLK help The calls support software clock gating and thus are a key power management tool on many systems. + diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 5fef4fece4a..0d1c26a583c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1254,7 +1254,6 @@ static int __init acpi_parse_madt_ioapic_entries(void) return count; } - count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, nr_irqs); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index a8d35998d30..845aa9803e8 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -184,4 +184,3 @@ void __init native_init_IRQ(void) irq_ctx_init(smp_processor_id()); } - diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 3245ad72594..46e05447405 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -173,7 +173,6 @@ SECTIONS *(.x86_cpu_dev.init) } __x86_cpu_dev_end = .; - SECURITY_INIT . = ALIGN(8); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 91343c2694b..8396868e82c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -66,7 +66,6 @@ static unsigned long __meminitdata table_end; static unsigned long __meminitdata table_top; static int __initdata after_init_bootmem; -int after_bootmem; static __init void *alloc_low_page(unsigned long *phys) { @@ -989,8 +988,6 @@ void __init mem_init(void) set_highmem_pages_init(); - after_bootmem = 1; - codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; diff --git a/include/linux/init.h b/include/linux/init.h index 70ad53e1eab..93538b696e3 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -246,7 +246,6 @@ struct obs_kernel_param { /* Relies on boot_command_line being set */ void __init parse_early_param(void); - #endif /* __ASSEMBLY__ */ /** diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index a9d0d360b77..89b6ecd4147 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,7 +28,7 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; - unsigned int irqs[NR_IRQS]; + unsigned int irqs[NR_IRQS]; }; DECLARE_PER_CPU(struct kernel_stat, kstat); -- cgit v1.2.3-70-g09d2 From c0c168ca26b54a4a6ad34fc813fe00f275fbc94c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 11:15:28 +0200 Subject: x86: cleanup show_interrupts The sparseirq patches introduced some more ugliness in show_interrupts(). Clean it up all together and make the code easier to read by splitting out the "tail" function which prints the special interrupts. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq_32.c | 160 +++++++++++++++++++++++------------------------ arch/x86/kernel/irq_64.c | 153 ++++++++++++++++++++++---------------------- 2 files changed, 154 insertions(+), 159 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index ccf6c1bf712..8d525765a6c 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -263,22 +263,67 @@ atomic_t irq_err_count; * /proc/interrupts printing: */ +static int show_other_interrupts(struct seq_file *p) +{ + int j; + + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", nmi_count(j)); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); +#endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_call_count); + seq_printf(p, " Function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + return 0; +} + int show_interrupts(struct seq_file *p, void *v) { + unsigned long flags, any_count = 0; int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - unsigned int entries; - struct irq_desc *desc = NULL; - int tail = 0; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; - entries = nr_irqs - 1; - i = *(loff_t *) v; if (i == nr_irqs) - tail = 1; - else - desc = irq_to_desc(i); + return show_other_interrupts(p); + /* print header */ if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) @@ -286,88 +331,37 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i <= entries) { - unsigned any_count = 0; - - spin_lock_irqsave(&desc->lock, flags); + desc = irq_to_desc(i); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP - any_count = kstat_irqs(i); + any_count = kstat_irqs(i); #else - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); #endif - action = desc->action; - if (!action && !any_count) - goto skip; - seq_printf(p, "%3d: ", i); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%3d: ", i); #ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); + seq_printf(p, "%10u ", kstat_irqs(i)); #else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif - seq_printf(p, " %8s", desc->chip->name); - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&desc->lock, flags); + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); } - if (tail) { - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", nmi_count(j)); - seq_printf(p, " Non-maskable interrupts\n"); -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#endif -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); -#endif -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); -#endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif - } + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); return 0; } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 21f53b91111..4f374294f29 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -68,22 +68,65 @@ static inline void stack_overflow_check(struct pt_regs *regs) * Generic, controller-independent functions: */ +static int show_other_interrupts(struct seq_file *p) +{ + int j; + + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); + seq_printf(p, "THR: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +#endif + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + + return 0; +} + int show_interrupts(struct seq_file *p, void *v) { - int i, j; - struct irqaction * action; - unsigned long flags; - unsigned int entries; - struct irq_desc *desc = NULL; - int tail = 0; - - entries = nr_irqs - 1; - i = *(loff_t *) v; + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + if (i == nr_irqs) - tail = 1; - else - desc = irq_to_desc(i); + return show_other_interrupts(p); + /* print header */ if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) @@ -91,79 +134,37 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i <= entries) { - unsigned any_count = 0; - - spin_lock_irqsave(&desc->lock, flags); + desc = irq_to_desc(i); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP - any_count = kstat_irqs(i); + any_count = kstat_irqs(i); #else - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); #endif - action = desc->action; - if (!action && !any_count) - goto skip; - seq_printf(p, "%3d: ", i); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%3d: ", i); #ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); + seq_printf(p, "%10u ", kstat_irqs(i)); #else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif - seq_printf(p, " %8s", desc->chip->name); - seq_printf(p, "-%-8s", desc->name); + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&desc->lock, flags); - } - - if (tail) { - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); - seq_printf(p, "THR: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); -#endif - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); } + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); return 0; } -- cgit v1.2.3-70-g09d2 From 6b39ba771e3c78d00e0abcebad270bd4212b28bc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 11:32:24 +0200 Subject: x86: unify show_interrupts() and proc helpers show_interrupts() and proc helpers are basically the same for 32 and 64 bit. Move them to a shared source file. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/irq.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irq_32.c | 146 ----------------------------------------- arch/x86/kernel/irq_64.c | 132 ------------------------------------- 4 files changed, 167 insertions(+), 279 deletions(-) create mode 100644 arch/x86/kernel/irq.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cb6ade443f0..d7e5a58ee22 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o -obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c new file mode 100644 index 00000000000..3b912849897 --- /dev/null +++ b/arch/x86/kernel/irq.c @@ -0,0 +1,166 @@ +/* + * Common interrupt code for 32 and 64 bit + */ +#include +#include +#include +#include + +#include +#include +#include + +atomic_t irq_err_count; + +#ifdef CONFIG_X86_32 +# define irq_stats(x) (&per_cpu(irq_stat,x)) +#else +# define irq_stats(x) cpu_pda(x) +#endif +/* + * /proc/interrupts printing: + */ +static int show_other_interrupts(struct seq_file *p) +{ + int j; + + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); +#endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +# ifdef CONFIG_X86_64 + seq_printf(p, "THR: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +# endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return show_other_interrupts(p); + + /* print header */ + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d",j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + spin_lock_irqsave(&desc->lock, flags); +#ifndef CONFIG_SMP + any_count = kstat_irqs(i); +#else + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); +#endif + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%3d: ", i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); +#endif + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; +#endif +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; + sum += irq_stats(cpu)->irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += irq_stats(cpu)->irq_thermal_count; +# ifdef CONFIG_X86_64 + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->irq_spurious_count; +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +} diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 8d525765a6c..6d9bf3936c7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -253,152 +253,6 @@ unsigned int do_IRQ(struct pt_regs *regs) return 1; } -/* - * Interrupt statistics: - */ - -atomic_t irq_err_count; - -/* - * /proc/interrupts printing: - */ - -static int show_other_interrupts(struct seq_file *p) -{ - int j; - - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", nmi_count(j)); - seq_printf(p, " Non-maskable interrupts\n"); -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#endif -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); -#endif -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); -#endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif - return 0; -} - -int show_interrupts(struct seq_file *p, void *v) -{ - unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j; - struct irqaction *action; - struct irq_desc *desc; - - if (i > nr_irqs) - return 0; - - if (i == nr_irqs) - return show_other_interrupts(p); - - /* print header */ - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); - seq_putc(p, '\n'); - } - - desc = irq_to_desc(i); - spin_lock_irqsave(&desc->lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); -#endif - action = desc->action; - if (!action && !any_count) - goto out; - - seq_printf(p, "%3d: ", i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); -#endif - seq_printf(p, " %8s", desc->chip->name); - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -out: - spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} - -/* - * /proc/stat helpers - */ -u64 arch_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = nmi_count(cpu); - -#ifdef CONFIG_X86_LOCAL_APIC - sum += per_cpu(irq_stat, cpu).apic_timer_irqs; -#endif -#ifdef CONFIG_SMP - sum += per_cpu(irq_stat, cpu).irq_resched_count; - sum += per_cpu(irq_stat, cpu).irq_call_count; - sum += per_cpu(irq_stat, cpu).irq_tlb_count; -#endif -#ifdef CONFIG_X86_MCE - sum += per_cpu(irq_stat, cpu).irq_thermal_count; -#endif -#ifdef CONFIG_X86_LOCAL_APIC - sum += per_cpu(irq_stat, cpu).irq_spurious_count; -#endif - return sum; -} - -u64 arch_irq_stat(void) -{ - u64 sum = atomic_read(&irq_err_count); - -#ifdef CONFIG_X86_IO_APIC - sum += atomic_read(&irq_mis_count); -#endif - return sum; -} - #ifdef CONFIG_HOTPLUG_CPU #include diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 4f374294f29..39ef7feb9ea 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,8 +18,6 @@ #include #include -atomic_t irq_err_count; - /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -64,136 +62,6 @@ static inline void stack_overflow_check(struct pt_regs *regs) } #endif -/* - * Generic, controller-independent functions: - */ - -static int show_other_interrupts(struct seq_file *p) -{ - int j; - - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); - seq_printf(p, "THR: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); -#endif - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); - - return 0; -} - -int show_interrupts(struct seq_file *p, void *v) -{ - unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j; - struct irqaction *action; - struct irq_desc *desc; - - if (i > nr_irqs) - return 0; - - if (i == nr_irqs) - return show_other_interrupts(p); - - /* print header */ - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); - seq_putc(p, '\n'); - } - - desc = irq_to_desc(i); - spin_lock_irqsave(&desc->lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); -#endif - action = desc->action; - if (!action && !any_count) - goto out; - - seq_printf(p, "%3d: ", i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); -#endif - seq_printf(p, " %8s", desc->chip->name); - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -out: - spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} - -/* - * /proc/stat helpers - */ -u64 arch_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = cpu_pda(cpu)->__nmi_count; - - sum += cpu_pda(cpu)->apic_timer_irqs; -#ifdef CONFIG_SMP - sum += cpu_pda(cpu)->irq_resched_count; - sum += cpu_pda(cpu)->irq_call_count; - sum += cpu_pda(cpu)->irq_tlb_count; -#endif -#ifdef CONFIG_X86_MCE - sum += cpu_pda(cpu)->irq_thermal_count; - sum += cpu_pda(cpu)->irq_threshold_count; -#endif - sum += cpu_pda(cpu)->irq_spurious_count; - return sum; -} - -u64 arch_irq_stat(void) -{ - return atomic_read(&irq_err_count); -} - /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific -- cgit v1.2.3-70-g09d2 From 249f6d9eab372790579ada8991bba3384c204e06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 12:18:50 +0200 Subject: x86: move ack_bad_irq() to irq.c Share more duplicated code. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 23 +++++++++++++++++++++++ arch/x86/kernel/irq_32.c | 23 ----------------------- arch/x86/kernel/irq_64.c | 20 -------------------- 3 files changed, 23 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3b912849897..ccf6c503fc3 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,29 @@ atomic_t irq_err_count; +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); +#endif +} + #ifdef CONFIG_X86_32 # define irq_stats(x) (&per_cpu(irq_stat,x)) #else diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 6d9bf3936c7..a51382672de 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); - -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But only ack when the APIC is enabled -AK - */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif -} - #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ static int check_stack_overflow(void) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 39ef7feb9ea..60eb84eb77a 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,26 +18,6 @@ #include #include -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq); - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK - */ - if (!disable_apic) - ack_APIC_irq(); -} - #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: -- cgit v1.2.3-70-g09d2 From 10e0298686be6249b0665465737a6b83446c4d35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 16 Oct 2008 17:04:22 +0200 Subject: io_apic: make irq_mis_count available on 64-bit too Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 6f80dc2f137..b764d7429c6 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2277,9 +2277,7 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } -#ifdef CONFIG_X86_32 atomic_t irq_mis_count; -#endif static void ack_apic_level(unsigned int irq) { -- cgit v1.2.3-70-g09d2 From 0f019cc477b494dfc472f2a98eb64d02d4937741 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 16 Oct 2008 15:01:40 +0200 Subject: oprofile: fixing whitespaces in arch/x86/oprofile/* Signed-off-by: Robert Richter --- arch/x86/oprofile/backtrace.c | 3 +-- arch/x86/oprofile/op_counter.h | 18 +++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 36e324139f7..04df67f8a7b 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -52,8 +52,7 @@ struct frame_head { unsigned long ret; } __attribute__((packed)); -static struct frame_head * -dump_user_backtrace(struct frame_head *head) +static struct frame_head *dump_user_backtrace(struct frame_head *head) { struct frame_head bufhead[2]; diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 2880b15c467..91b6a116165 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -6,22 +6,22 @@ * * @author John Levon */ - + #ifndef OP_COUNTER_H #define OP_COUNTER_H - + #define OP_MAX_COUNTER 8 - + /* Per-perfctr configuration as set via * oprofilefs. */ struct op_counter_config { - unsigned long count; - unsigned long enabled; - unsigned long event; - unsigned long kernel; - unsigned long user; - unsigned long unit_mask; + unsigned long count; + unsigned long enabled; + unsigned long event; + unsigned long kernel; + unsigned long user; + unsigned long unit_mask; }; extern struct op_counter_config counter_config[]; -- cgit v1.2.3-70-g09d2 From ae87221d3ce49d9de1e43756da834fd0bf05a2ad Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Aug 2007 16:11:54 -0700 Subject: sysfs: crash debugging Print the name of the last-accessed sysfs file when we oops, to help track down oopses which occur in sysfs store/read handlers. Because these oopses tend to not leave any trace of the offending code in the stack traces. Cc: Kay Sievers Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/dumpstack_32.c | 2 ++ arch/x86/kernel/dumpstack_64.c | 2 ++ fs/sysfs/file.c | 13 +++++++++++++ include/linux/sysfs.h | 6 ++++++ 4 files changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 201ee359a1a..1a78180f08d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -343,6 +344,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); + sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 086cc8118e3..96a5db7da8a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -489,6 +490,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); + sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index c9e4e5091da..ce8339c70a4 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -19,10 +19,18 @@ #include #include #include +#include #include #include "sysfs.h" +/* used in crash dumps to help with debugging */ +static char last_sysfs_file[PATH_MAX]; +void sysfs_printk_last_file(void) +{ + printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file); +} + /* * There's one sysfs_buffer for each open file and one * sysfs_open_dirent for each sysfs_dirent with one or more open @@ -328,6 +336,11 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_buffer *buffer; struct sysfs_ops *ops; int error = -EACCES; + char *p; + + p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); + if (p) + memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active_two(attr_sd)) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 37fa24152bd..8ec406afb3e 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -119,6 +119,8 @@ void sysfs_remove_file_from_group(struct kobject *kobj, void sysfs_notify(struct kobject *kobj, char *dir, char *attr); +void sysfs_printk_last_file(void); + extern int __must_check sysfs_init(void); #else /* CONFIG_SYSFS */ @@ -231,6 +233,10 @@ static inline int __must_check sysfs_init(void) return 0; } +static inline void sysfs_printk_last_file(void) +{ +} + #endif /* CONFIG_SYSFS */ #endif /* _SYSFS_H_ */ -- cgit v1.2.3-70-g09d2 From a9b12619f7b6f19c871437ec24a088787a04b1de Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 21 Jul 2008 20:03:34 -0700 Subject: device create: misc: convert device_create_drvdata to device_create Now that device_create() has been audited, rename things back to the original call to be sane. Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/rtlx.c | 4 ++-- arch/mips/sibyte/common/sb_tbprof.c | 3 +-- arch/x86/kernel/cpuid.c | 4 ++-- arch/x86/kernel/msr.c | 4 ++-- drivers/dca/dca-sysfs.c | 8 +++----- drivers/hid/hidraw.c | 5 ++--- drivers/hwmon/hwmon.c | 4 ++-- drivers/i2c/i2c-dev.c | 6 +++--- drivers/isdn/capi/capi.c | 3 +-- drivers/leds/led-class.c | 4 ++-- drivers/macintosh/adb.c | 3 +-- drivers/media/dvb/dvb-core/dvbdev.c | 2 +- drivers/misc/phantom.c | 6 +++--- drivers/mtd/mtdchar.c | 10 ++++------ drivers/power/power_supply_core.c | 4 ++-- drivers/spi/spidev.c | 7 +++---- drivers/uio/uio.c | 6 +++--- fs/coda/psdev.c | 5 ++--- 18 files changed, 39 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c index dfd868b6836..4ce93aa7b37 100644 --- a/arch/mips/kernel/rtlx.c +++ b/arch/mips/kernel/rtlx.c @@ -522,8 +522,8 @@ static int __init rtlx_module_init(void) atomic_set(&channel_wqs[i].in_open, 0); mutex_init(&channel_wqs[i].mutex); - dev = device_create_drvdata(mt_class, NULL, MKDEV(major, i), - NULL, "%s%d", module_name, i); + dev = device_create(mt_class, NULL, MKDEV(major, i), NULL, + "%s%d", module_name, i); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_chrdev; diff --git a/arch/mips/sibyte/common/sb_tbprof.c b/arch/mips/sibyte/common/sb_tbprof.c index 66e3e3fb311..637a194e5cd 100644 --- a/arch/mips/sibyte/common/sb_tbprof.c +++ b/arch/mips/sibyte/common/sb_tbprof.c @@ -576,8 +576,7 @@ static int __init sbprof_tb_init(void) tb_class = tbc; - dev = device_create_drvdata(tbc, NULL, MKDEV(SBPROF_TB_MAJOR, 0), - NULL, "tb"); + dev = device_create(tbc, NULL, MKDEV(SBPROF_TB_MAJOR, 0), NULL, "tb"); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_class; diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 6a44d646599..72cefd1e649 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -147,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), - NULL, "cpu%d", cpu); + dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, + "cpu%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 2e2af5d1819..82a7c7ed6d4 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -163,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu) { struct device *dev; - dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), - NULL, "msr%d", cpu); + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, + "msr%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/drivers/dca/dca-sysfs.c b/drivers/dca/dca-sysfs.c index 7af4b403bd2..bb538b9690e 100644 --- a/drivers/dca/dca-sysfs.c +++ b/drivers/dca/dca-sysfs.c @@ -15,9 +15,8 @@ int dca_sysfs_add_req(struct dca_provider *dca, struct device *dev, int slot) struct device *cd; static int req_count; - cd = device_create_drvdata(dca_class, dca->cd, - MKDEV(0, slot + 1), NULL, - "requester%d", req_count++); + cd = device_create(dca_class, dca->cd, MKDEV(0, slot + 1), NULL, + "requester%d", req_count++); if (IS_ERR(cd)) return PTR_ERR(cd); return 0; @@ -48,8 +47,7 @@ idr_try_again: return err; } - cd = device_create_drvdata(dca_class, dev, MKDEV(0, 0), NULL, - "dca%d", dca->id); + cd = device_create(dca_class, dev, MKDEV(0, 0), NULL, "dca%d", dca->id); if (IS_ERR(cd)) { spin_lock(&dca_idr_lock); idr_remove(&dca_idr, dca->id); diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 497e0d1dd3c..af3edb98df4 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -326,9 +326,8 @@ int hidraw_connect(struct hid_device *hid) goto out; } - dev->dev = device_create_drvdata(hidraw_class, NULL, - MKDEV(hidraw_major, minor), NULL, - "%s%d", "hidraw", minor); + dev->dev = device_create(hidraw_class, NULL, MKDEV(hidraw_major, minor), + NULL, "%s%d", "hidraw", minor); if (IS_ERR(dev->dev)) { spin_lock(&minors_lock); diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 7321a88a511..076a59cdabe 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -55,8 +55,8 @@ again: return ERR_PTR(err); id = id & MAX_ID_MASK; - hwdev = device_create_drvdata(hwmon_class, dev, MKDEV(0, 0), NULL, - HWMON_ID_FORMAT, id); + hwdev = device_create(hwmon_class, dev, MKDEV(0, 0), NULL, + HWMON_ID_FORMAT, id); if (IS_ERR(hwdev)) { spin_lock(&idr_lock); diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c index 307d976c9b6..c171988a9f5 100644 --- a/drivers/i2c/i2c-dev.c +++ b/drivers/i2c/i2c-dev.c @@ -521,9 +521,9 @@ static int i2cdev_attach_adapter(struct i2c_adapter *adap) return PTR_ERR(i2c_dev); /* register this i2c device with the driver core */ - i2c_dev->dev = device_create_drvdata(i2c_dev_class, &adap->dev, - MKDEV(I2C_MAJOR, adap->nr), - NULL, "i2c-%d", adap->nr); + i2c_dev->dev = device_create(i2c_dev_class, &adap->dev, + MKDEV(I2C_MAJOR, adap->nr), NULL, + "i2c-%d", adap->nr); if (IS_ERR(i2c_dev->dev)) { res = PTR_ERR(i2c_dev->dev); goto error; diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 798d7f3e42e..1b5bf87c4cf 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -1553,8 +1553,7 @@ static int __init capi_init(void) return PTR_ERR(capi_class); } - device_create_drvdata(capi_class, NULL, MKDEV(capi_major, 0), NULL, - "capi"); + device_create(capi_class, NULL, MKDEV(capi_major, 0), NULL, "capi"); #ifdef CONFIG_ISDN_CAPI_MIDDLEWARE if (capinc_tty_init() < 0) { diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 559a40861c3..ee74ee7b2ac 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -103,8 +103,8 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev) { int rc; - led_cdev->dev = device_create_drvdata(leds_class, parent, 0, led_cdev, - "%s", led_cdev->name); + led_cdev->dev = device_create(leds_class, parent, 0, led_cdev, + "%s", led_cdev->name); if (IS_ERR(led_cdev->dev)) return PTR_ERR(led_cdev->dev); diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index cae52485208..23741cec45e 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -862,8 +862,7 @@ adbdev_init(void) adb_dev_class = class_create(THIS_MODULE, "adb"); if (IS_ERR(adb_dev_class)) return; - device_create_drvdata(adb_dev_class, NULL, MKDEV(ADB_MAJOR, 0), NULL, - "adb"); + device_create(adb_dev_class, NULL, MKDEV(ADB_MAJOR, 0), NULL, "adb"); platform_device_register(&adb_pfdev); platform_driver_probe(&adb_pfdrv, adb_dummy_probe); diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c index e7132770a3b..665776d72a4 100644 --- a/drivers/media/dvb/dvb-core/dvbdev.c +++ b/drivers/media/dvb/dvb-core/dvbdev.c @@ -233,7 +233,7 @@ int dvb_register_device(struct dvb_adapter *adap, struct dvb_device **pdvbdev, mutex_unlock(&dvbdev_register_lock); - clsdev = device_create_drvdata(dvb_class, adap->device, + clsdev = device_create(dvb_class, adap->device, MKDEV(DVB_MAJOR, nums2minor(adap->num, type, id)), NULL, "dvb%d.%s%d", adap->num, dnames[type], id); if (IS_ERR(clsdev)) { diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c index daf585689ce..abdebe34738 100644 --- a/drivers/misc/phantom.c +++ b/drivers/misc/phantom.c @@ -399,9 +399,9 @@ static int __devinit phantom_probe(struct pci_dev *pdev, goto err_irq; } - if (IS_ERR(device_create_drvdata(phantom_class, &pdev->dev, - MKDEV(phantom_major, minor), - NULL, "phantom%u", minor))) + if (IS_ERR(device_create(phantom_class, &pdev->dev, + MKDEV(phantom_major, minor), NULL, + "phantom%u", minor))) dev_err(&pdev->dev, "can't create device\n"); pci_set_drvdata(pdev, pht); diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index e00d424e657..1c74762dec8 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -26,13 +26,11 @@ static void mtd_notify_add(struct mtd_info* mtd) if (!mtd) return; - device_create_drvdata(mtd_class, NULL, - MKDEV(MTD_CHAR_MAJOR, mtd->index*2), - NULL, "mtd%d", mtd->index); + device_create(mtd_class, NULL, MKDEV(MTD_CHAR_MAJOR, mtd->index*2), + NULL, "mtd%d", mtd->index); - device_create_drvdata(mtd_class, NULL, - MKDEV(MTD_CHAR_MAJOR, mtd->index*2+1), - NULL, "mtd%dro", mtd->index); + device_create(mtd_class, NULL, MKDEV(MTD_CHAR_MAJOR, mtd->index*2+1), + NULL, "mtd%dro", mtd->index); } static void mtd_notify_remove(struct mtd_info* mtd) diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c index cb1ccb47292..3007695f90c 100644 --- a/drivers/power/power_supply_core.c +++ b/drivers/power/power_supply_core.c @@ -91,8 +91,8 @@ int power_supply_register(struct device *parent, struct power_supply *psy) { int rc = 0; - psy->dev = device_create_drvdata(power_supply_class, parent, 0, - psy, "%s", psy->name); + psy->dev = device_create(power_supply_class, parent, 0, psy, + "%s", psy->name); if (IS_ERR(psy->dev)) { rc = PTR_ERR(psy->dev); goto dev_create_failed; diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index e5e0cfed5e3..89a43755a45 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -583,10 +583,9 @@ static int spidev_probe(struct spi_device *spi) struct device *dev; spidev->devt = MKDEV(SPIDEV_MAJOR, minor); - dev = device_create_drvdata(spidev_class, &spi->dev, - spidev->devt, spidev, - "spidev%d.%d", - spi->master->bus_num, spi->chip_select); + dev = device_create(spidev_class, &spi->dev, spidev->devt, + spidev, "spidev%d.%d", + spi->master->bus_num, spi->chip_select); status = IS_ERR(dev) ? PTR_ERR(dev) : 0; } else { dev_dbg(&spi->dev, "no minor number available!\n"); diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 3a6934bf713..9ac22c7c385 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -682,9 +682,9 @@ int __uio_register_device(struct module *owner, if (ret) goto err_get_minor; - idev->dev = device_create_drvdata(uio_class->class, parent, - MKDEV(uio_major, idev->minor), idev, - "uio%d", idev->minor); + idev->dev = device_create(uio_class->class, parent, + MKDEV(uio_major, idev->minor), idev, + "uio%d", idev->minor); if (IS_ERR(idev->dev)) { printk(KERN_ERR "UIO: device register failed\n"); ret = PTR_ERR(idev->dev); diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 0d9b80ec689..cfd29da714d 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -362,9 +362,8 @@ static int init_coda_psdev(void) goto out_chrdev; } for (i = 0; i < MAX_CODADEVS; i++) - device_create_drvdata(coda_psdev_class, NULL, - MKDEV(CODA_PSDEV_MAJOR, i), - NULL, "cfs%d", i); + device_create(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); coda_sysctl_init(); goto out; -- cgit v1.2.3-70-g09d2 From 80a914dc05683ecfc98f9e1887fd6564846ffbec Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 15 Oct 2008 22:01:25 -0700 Subject: misc: replace __FUNCTION__ with __func__ __FUNCTION__ is gcc-specific, use __func__ Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/kernel/process.c | 2 +- arch/x86/kvm/x86.c | 2 +- drivers/gpu/drm/i915/i915_dma.c | 4 ++-- drivers/input/joystick/xpad.c | 4 ++-- drivers/s390/net/ctcm_mpc.c | 2 +- mm/bootmem.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 5be4faaf5b1..7103d91e1a2 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -11,7 +11,7 @@ #undef DEBUG_PROCESS #ifdef DEBUG_PROCESS #define DPRINTK(fmt, args...) printk("%s:%d:%s: " fmt, __FILE__, __LINE__, \ - __FUNCTION__, ##args) + __func__, ##args) #else #define DPRINTK(fmt, args...) #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d682fc6aeb..19afbb644c7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -564,7 +564,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + __func__, tsc_khz, hv_clock->tsc_shift, hv_clock->tsc_to_system_mul); } diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 88974342933..9ac4720e647 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -517,7 +517,7 @@ static int i915_dispatch_flip(struct drm_device * dev) RING_LOCALS; DRM_DEBUG("%s: page=%d pfCurrentPage=%d\n", - __FUNCTION__, + __func__, dev_priv->current_page, dev_priv->sarea_priv->pf_current_page); @@ -642,7 +642,7 @@ static int i915_cmdbuffer(struct drm_device *dev, void *data, static int i915_flip_bufs(struct drm_device *dev, void *data, struct drm_file *file_priv) { - DRM_DEBUG("%s\n", __FUNCTION__); + DRM_DEBUG("%s\n", __func__); LOCK_TEST_WITH_RETURN(dev, file_priv); diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 6791be81eb2..839d1c9622f 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -455,10 +455,10 @@ static void xpad_bulk_out(struct urb *urb) case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ - dbg("%s - urb shutting down with status: %d", __FUNCTION__, urb->status); + dbg("%s - urb shutting down with status: %d", __func__, urb->status); break; default: - dbg("%s - nonzero urb status received: %d", __FUNCTION__, urb->status); + dbg("%s - nonzero urb status received: %d", __func__, urb->status); } } diff --git a/drivers/s390/net/ctcm_mpc.c b/drivers/s390/net/ctcm_mpc.c index cbe470493bf..19f5d5ed85e 100644 --- a/drivers/s390/net/ctcm_mpc.c +++ b/drivers/s390/net/ctcm_mpc.c @@ -1673,7 +1673,7 @@ static int mpc_validate_xid(struct mpcg_info *mpcginfo) done: if (rc) { - ctcm_pr_info("ctcmpc : %s() failed\n", __FUNCTION__); + ctcm_pr_info("ctcmpc : %s() failed\n", __func__); priv->xid->xid2_flag2 = 0x40; grp->saved_xid2->xid2_flag2 = 0x40; } diff --git a/mm/bootmem.c b/mm/bootmem.c index ad8eec6e44a..ac5a891f142 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -48,7 +48,7 @@ early_param("bootmem_debug", bootmem_debug_setup); if (unlikely(bootmem_debug)) \ printk(KERN_INFO \ "bootmem::%s " fmt, \ - __FUNCTION__, ## args); \ + __func__, ## args); \ }) static unsigned long __init bootmap_bytes(unsigned long pages) -- cgit v1.2.3-70-g09d2 From 9ba16087d9f996a93ab6f4453a52a4b24bc1f25c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 15 Oct 2008 22:01:38 -0700 Subject: Kconfig: eliminate "def_bool n" constructs Using "def_bool n" is pointless, simply using bool here appears more appropriate. Further, retaining such options that don't have a prompt and aren't selected by anything seems also at least questionable. Signed-off-by: Jan Beulich Cc: Ingo Molnar Cc: Tony Luck Cc: Thomas Gleixner Cc: Bartlomiej Zolnierkiewicz Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 18 +++++++++--------- arch/ia64/Kconfig | 8 -------- arch/x86/Kconfig | 23 +++++------------------ drivers/ide/Kconfig | 2 +- kernel/time/Kconfig | 1 - lib/Kconfig | 4 ++-- mm/Kconfig | 4 ++-- 7 files changed, 19 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 0267babe5eb..e6ab550bceb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -28,7 +28,7 @@ config OPROFILE_IBS If unsure, say N. config HAVE_OPROFILE - def_bool n + bool config KPROBES bool "Kprobes" @@ -42,7 +42,7 @@ config KPROBES If in doubt, say "N". config HAVE_EFFICIENT_UNALIGNED_ACCESS - def_bool n + bool help Some architectures are unable to perform unaligned accesses without the use of get_unaligned/put_unaligned. Others are @@ -65,13 +65,13 @@ config KRETPROBES depends on KPROBES && HAVE_KRETPROBES config HAVE_IOREMAP_PROT - def_bool n + bool config HAVE_KPROBES - def_bool n + bool config HAVE_KRETPROBES - def_bool n + bool # # An arch should select this if it provides all these things: @@ -89,16 +89,16 @@ config HAVE_KRETPROBES # signal delivery calls tracehook_signal_handler() # config HAVE_ARCH_TRACEHOOK - def_bool n + bool config HAVE_DMA_ATTRS - def_bool n + bool config USE_GENERIC_SMP_HELPERS - def_bool n + bool config HAVE_CLK - def_bool n + bool help The calls support software clock gating and thus are a key power management tool on many systems. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 48e496fe1e7..3b7aa38254a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -60,14 +60,6 @@ config RWSEM_XCHGADD_ALGORITHM bool default y -config ARCH_HAS_ILOG2_U32 - bool - default n - -config ARCH_HAS_ILOG2_U64 - bool - default n - config HUGETLB_PAGE_SIZE_VARIABLE bool depends on HUGETLB_PAGE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f65c2744d57..7ccb6e60e60 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -39,10 +39,6 @@ config ARCH_DEFCONFIG default "arch/x86/configs/i386_defconfig" if X86_32 default "arch/x86/configs/x86_64_defconfig" if X86_64 - -config GENERIC_LOCKBREAK - def_bool n - config GENERIC_TIME def_bool y @@ -95,7 +91,7 @@ config GENERIC_HWEIGHT def_bool y config GENERIC_GPIO - def_bool n + bool config ARCH_MAY_HAVE_PC_FDC def_bool y @@ -106,12 +102,6 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM def_bool X86_XADD -config ARCH_HAS_ILOG2_U32 - def_bool n - -config ARCH_HAS_ILOG2_U64 - def_bool n - config ARCH_HAS_CPU_IDLE_WAIT def_bool y @@ -758,9 +748,8 @@ config I8K Say N otherwise. config X86_REBOOTFIXUPS - def_bool n - prompt "Enable X86 board specific fixups for reboot" - depends on X86_32 && X86 + bool "Enable X86 board specific fixups for reboot" + depends on X86_32 ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -944,8 +933,7 @@ config HIGHMEM depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) config X86_PAE - def_bool n - prompt "PAE (Physical Address Extension) Support" + bool "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G select RESOURCES_64BIT help @@ -1238,8 +1226,7 @@ config X86_PAT If unsure, say Y. config EFI - def_bool n - prompt "EFI runtime service support" + bool "EFI runtime service support" depends on ACPI ---help--- This enables the kernel to use EFI runtime services that are diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 6c6dd2faced..74a369a6116 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -4,7 +4,7 @@ # Select HAVE_IDE if IDE is supported config HAVE_IDE - def_bool n + bool menuconfig IDE tristate "ATA/ATAPI/MFM/RLL support" diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8d53106a0a9..95ed42951e0 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -3,7 +3,6 @@ # config TICK_ONESHOT bool - default n config NO_HZ bool "Tickless System (Dynamic Ticks)" diff --git a/lib/Kconfig b/lib/Kconfig index c7ad7a5b353..85cf7ea978a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -8,10 +8,10 @@ config BITREVERSE tristate config GENERIC_FIND_FIRST_BIT - def_bool n + bool config GENERIC_FIND_NEXT_BIT - def_bool n + bool config CRC_CCITT tristate "CRC-CCITT functions" diff --git a/mm/Kconfig b/mm/Kconfig index 0bd9c2dbb2a..5585f129359 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT # with gcc 3.4 and later. # config SPARSEMEM_STATIC - def_bool n + bool # # Architecture platforms which require a two level mem_section in SPARSEMEM @@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME depends on SPARSEMEM && !SPARSEMEM_STATIC config SPARSEMEM_VMEMMAP_ENABLE - def_bool n + bool config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" -- cgit v1.2.3-70-g09d2 From 25ddbb18aae33ad255eb9f35aacebe3af01e1e9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Oct 2008 22:01:41 -0700 Subject: Make the taint flags reliable It's somewhat unlikely that it happens, but right now a race window between interrupts or machine checks or oopses could corrupt the tainted bitmap because it is modified in a non atomic fashion. Convert the taint variable to an unsigned long and use only atomic bit operations on it. Unfortunately this means the intvec sysctl functions cannot be used on it anymore. It turned out the taint sysctl handler could actually be simplified a bit (since it only increases capabilities) so this patch actually removes code. [akpm@linux-foundation.org: remove unneeded include] Signed-off-by: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/smpboot.c | 14 +++++----- include/linux/kernel.h | 25 +++++++++--------- kernel/module.c | 12 ++++----- kernel/panic.c | 63 ++++++++++++++++++++++++++++++++------------ kernel/softlockup.c | 2 +- kernel/sysctl.c | 67 ++++++++++++++++++++--------------------------- 6 files changed, 101 insertions(+), 82 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c3aca7cb34..7ed9e070a6e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -282,6 +282,8 @@ static void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +static int __cpuinitdata unsafe_smp; + /* * Activate a secondary processor. */ @@ -397,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) goto valid_k7; /* If we get here, not a certified SMP capable AMD system. */ - add_taint(TAINT_UNSAFE_SMP); + unsafe_smp = 1; } valid_k7: @@ -414,12 +416,10 @@ static void __cpuinit smp_checks(void) * Don't taint if we are running SMP kernel on a single non-MP * approved Athlon */ - if (tainted & TAINT_UNSAFE_SMP) { - if (num_online_cpus()) - printk(KERN_INFO "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); - else - tainted &= ~TAINT_UNSAFE_SMP; + if (unsafe_smp && num_online_cpus() > 1) { + printk(KERN_INFO "WARNING: This combination of AMD" + "processors is not suitable for SMP.\n"); + add_taint(TAINT_UNSAFE_SMP); } } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 75d81f157d2..e971c55f45a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -235,9 +235,10 @@ extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in extern int panic_timeout; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; -extern int tainted; extern const char *print_tainted(void); -extern void add_taint(unsigned); +extern void add_taint(unsigned flag); +extern int test_taint(unsigned flag); +extern unsigned long get_taint(void); extern int root_mountflags; /* Values used for system_state */ @@ -250,16 +251,16 @@ extern enum system_states { SYSTEM_SUSPEND_DISK, } system_state; -#define TAINT_PROPRIETARY_MODULE (1<<0) -#define TAINT_FORCED_MODULE (1<<1) -#define TAINT_UNSAFE_SMP (1<<2) -#define TAINT_FORCED_RMMOD (1<<3) -#define TAINT_MACHINE_CHECK (1<<4) -#define TAINT_BAD_PAGE (1<<5) -#define TAINT_USER (1<<6) -#define TAINT_DIE (1<<7) -#define TAINT_OVERRIDDEN_ACPI_TABLE (1<<8) -#define TAINT_WARN (1<<9) +#define TAINT_PROPRIETARY_MODULE 0 +#define TAINT_FORCED_MODULE 1 +#define TAINT_UNSAFE_SMP 2 +#define TAINT_FORCED_RMMOD 3 +#define TAINT_MACHINE_CHECK 4 +#define TAINT_BAD_PAGE 5 +#define TAINT_USER 6 +#define TAINT_DIE 7 +#define TAINT_OVERRIDDEN_ACPI_TABLE 8 +#define TAINT_WARN 9 extern void dump_stack(void) __cold; diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04..dd9ac6ad5cb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod) static inline void add_taint_module(struct module *mod, unsigned flag) { add_taint(flag); - mod->taints |= flag; + mod->taints |= (1U << flag); } /* @@ -923,7 +923,7 @@ static const char vermagic[] = VERMAGIC_STRING; static int try_to_force_load(struct module *mod, const char *symname) { #ifdef CONFIG_MODULE_FORCE_LOAD - if (!(tainted & TAINT_FORCED_MODULE)) + if (!test_taint(TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint_module(mod, TAINT_FORCED_MODULE); @@ -1033,7 +1033,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, const unsigned long *crc; ret = find_symbol(name, &owner, &crc, - !(mod->taints & TAINT_PROPRIETARY_MODULE), true); + !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ @@ -1634,7 +1634,7 @@ static void set_license(struct module *mod, const char *license) license = "unspecified"; if (!license_is_gpl_compatible(license)) { - if (!(tainted & TAINT_PROPRIETARY_MODULE)) + if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); @@ -2552,9 +2552,9 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & TAINT_PROPRIETARY_MODULE) + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) buf[bx++] = 'P'; - if (mod->taints & TAINT_FORCED_MODULE) + if (mod->taints & (1 << TAINT_FORCED_MODULE)) buf[bx++] = 'F'; /* * TAINT_FORCED_RMMOD: could be added. diff --git a/kernel/panic.c b/kernel/panic.c index 12c5a0a6c89..028013f7afd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,7 +23,7 @@ #include int panic_on_oops; -int tainted; +static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -159,31 +159,60 @@ EXPORT_SYMBOL(panic); * The string is overwritten by the next call to print_taint(). */ +struct tnt { + u8 bit; + char true; + char false; +}; + +static const struct tnt tnts[] = { + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, +}; + const char *print_tainted(void) { - static char buf[20]; - if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", - tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', - tainted & TAINT_FORCED_MODULE ? 'F' : ' ', - tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', - tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', - tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 'B' : ' ', - tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); - } - else + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; + + if (tainted_mask) { + char *s; + int i; + + s = buf + sprintf(buf, "Tainted: "); + for (i = 0; i < ARRAY_SIZE(tnts); i++) { + const struct tnt *t = &tnts[i]; + *s++ = test_bit(t->bit, &tainted_mask) ? + t->true : t->false; + } + *s = 0; + } else snprintf(buf, sizeof(buf), "Not tainted"); return(buf); } +int test_taint(unsigned flag) +{ + return test_bit(flag, &tainted_mask); +} +EXPORT_SYMBOL(test_taint); + +unsigned long get_taint(void) +{ + return tainted_mask; +} + void add_taint(unsigned flag) { debug_locks = 0; /* can't trust the integrity of the kernel anymore */ - tainted |= flag; + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index cb838ee93a8..3953e4aed73 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu) * If the system crashed already then all bets are off, * do not report extra hung tasks: */ - if ((tainted & TAINT_DIE) || did_panic) + if (test_taint(TAINT_DIE) || did_panic) return; read_lock(&tasklist_lock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cfc5295f1e8..ec88fcc9a0d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -149,7 +149,7 @@ extern int max_lock_depth; #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -379,10 +379,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", - .data = &tainted, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_dointvec_taint, + .proc_handler = &proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -2228,49 +2227,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, NULL,NULL); } -#define OP_SET 0 -#define OP_AND 1 -#define OP_OR 2 - -static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int op = *(int *)data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - switch(op) { - case OP_SET: *valp = val; break; - case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - } - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - /* - * Taint values can only be increased + * Taint values can only be increased + * This means we can safely use a temporary. */ -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int op; + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - op = OP_OR; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i); + } + } + + return err; } struct do_proc_dointvec_minmax_conv_param { -- cgit v1.2.3-70-g09d2 From f7a5000f7a8924e9c5fad1801616601d6dc65a17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Oct 2008 22:02:05 -0700 Subject: compat: move cp_compat_stat to common code struct stat / compat_stat is the same on all architectures, so cp_compat_stat should be, too. Turns out it is, except that various architectures have slightly and some high2lowuid/high2lowgid or the direct assignment instead of the SET_UID/SET_GID that expands to the correct one anyway. This patch replaces the arch-specific cp_compat_stat implementations with a common one based on the x86-64 one. Signed-off-by: Christoph Hellwig Acked-by: David S. Miller [ sparc bits ] Acked-by: Kyle McMartin [ parisc bits ] Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/ia32/sys_ia32.c | 35 ----------------------------- arch/mips/kernel/linux32.c | 35 ----------------------------- arch/parisc/kernel/sys_parisc32.c | 47 --------------------------------------- arch/powerpc/kernel/sys_ppc32.c | 36 ------------------------------ arch/s390/kernel/compat_linux.c | 35 ----------------------------- arch/sparc64/kernel/sys_sparc32.c | 35 ----------------------------- arch/x86/ia32/sys_ia32.c | 35 ----------------------------- fs/compat.c | 39 ++++++++++++++++++++++++++++++++ include/linux/compat.h | 1 - 9 files changed, 39 insertions(+), 259 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index bf196cbb379..2362a8eefb3 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -118,41 +118,6 @@ sys32_execve (char __user *name, compat_uptr_t __user *argv, compat_uptr_t __use return error; } -int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) -{ - compat_ino_t ino; - int err; - - if ((u64) stat->size > MAX_NON_LFS || - !old_valid_dev(stat->dev) || - !old_valid_dev(stat->rdev)) - return -EOVERFLOW; - - ino = stat->ino; - if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) - return -EOVERFLOW; - - if (clear_user(ubuf, sizeof(*ubuf))) - return -EFAULT; - - err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); - err |= __put_user(ino, &ubuf->st_ino); - err |= __put_user(stat->mode, &ubuf->st_mode); - err |= __put_user(stat->nlink, &ubuf->st_nlink); - err |= __put_user(high2lowuid(stat->uid), &ubuf->st_uid); - err |= __put_user(high2lowgid(stat->gid), &ubuf->st_gid); - err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); - err |= __put_user(stat->size, &ubuf->st_size); - err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); - err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); - err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); - err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); - err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); - err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); - err |= __put_user(stat->blksize, &ubuf->st_blksize); - err |= __put_user(stat->blocks, &ubuf->st_blocks); - return err; -} #if PAGE_SHIFT > IA32_PAGE_SHIFT diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 2fefb14414b..89223a9bff2 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -63,41 +63,6 @@ #define merge_64(r1, r2) ((((r2) & 0xffffffffUL) << 32) + ((r1) & 0xffffffffUL)) #endif -/* - * Revalidate the inode. This is required for proper NFS attribute caching. - */ - -int cp_compat_stat(struct kstat *stat, struct compat_stat __user *statbuf) -{ - struct compat_stat tmp; - - if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) - return -EOVERFLOW; - - memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = new_encode_dev(stat->dev); - tmp.st_ino = stat->ino; - if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) - return -EOVERFLOW; - tmp.st_mode = stat->mode; - tmp.st_nlink = stat->nlink; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); - tmp.st_rdev = new_encode_dev(stat->rdev); - tmp.st_size = stat->size; - tmp.st_atime = stat->atime.tv_sec; - tmp.st_mtime = stat->mtime.tv_sec; - tmp.st_ctime = stat->ctime.tv_sec; -#ifdef STAT_HAVE_NSEC - tmp.st_atime_nsec = stat->atime.tv_nsec; - tmp.st_mtime_nsec = stat->mtime.tv_nsec; - tmp.st_ctime_nsec = stat->ctime.tv_nsec; -#endif - tmp.st_blocks = stat->blocks; - tmp.st_blksize = stat->blksize; - return copy_to_user(statbuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; -} - asmlinkage unsigned long sys32_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 71efd6a28e2..2c3af17e049 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -237,53 +237,6 @@ int sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } -int cp_compat_stat(struct kstat *stat, struct compat_stat __user *statbuf) -{ - compat_ino_t ino; - int err; - - if (stat->size > MAX_NON_LFS || !new_valid_dev(stat->dev) || - !new_valid_dev(stat->rdev)) - return -EOVERFLOW; - - ino = stat->ino; - if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) - return -EOVERFLOW; - - err = put_user(new_encode_dev(stat->dev), &statbuf->st_dev); - err |= put_user(ino, &statbuf->st_ino); - err |= put_user(stat->mode, &statbuf->st_mode); - err |= put_user(stat->nlink, &statbuf->st_nlink); - err |= put_user(0, &statbuf->st_reserved1); - err |= put_user(0, &statbuf->st_reserved2); - err |= put_user(new_encode_dev(stat->rdev), &statbuf->st_rdev); - err |= put_user(stat->size, &statbuf->st_size); - err |= put_user(stat->atime.tv_sec, &statbuf->st_atime); - err |= put_user(stat->atime.tv_nsec, &statbuf->st_atime_nsec); - err |= put_user(stat->mtime.tv_sec, &statbuf->st_mtime); - err |= put_user(stat->mtime.tv_nsec, &statbuf->st_mtime_nsec); - err |= put_user(stat->ctime.tv_sec, &statbuf->st_ctime); - err |= put_user(stat->ctime.tv_nsec, &statbuf->st_ctime_nsec); - err |= put_user(stat->blksize, &statbuf->st_blksize); - err |= put_user(stat->blocks, &statbuf->st_blocks); - err |= put_user(0, &statbuf->__unused1); - err |= put_user(0, &statbuf->__unused2); - err |= put_user(0, &statbuf->__unused3); - err |= put_user(0, &statbuf->__unused4); - err |= put_user(0, &statbuf->__unused5); - err |= put_user(0, &statbuf->st_fstype); /* not avail */ - err |= put_user(0, &statbuf->st_realdev); /* not avail */ - err |= put_user(0, &statbuf->st_basemode); /* not avail */ - err |= put_user(0, &statbuf->st_spareshort); - err |= put_user(stat->uid, &statbuf->st_uid); - err |= put_user(stat->gid, &statbuf->st_gid); - err |= put_user(0, &statbuf->st_spare4[0]); - err |= put_user(0, &statbuf->st_spare4[1]); - err |= put_user(0, &statbuf->st_spare4[2]); - - return err; -} - /*** copied from mips64 ***/ /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index ff7de7b0797..d00599bb24a 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -61,42 +61,6 @@ asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp, return compat_sys_select((int)n, inp, outp, exp, compat_ptr(tvp_x)); } -int cp_compat_stat(struct kstat *stat, struct compat_stat __user *statbuf) -{ - compat_ino_t ino; - long err; - - if (stat->size > MAX_NON_LFS || !new_valid_dev(stat->dev) || - !new_valid_dev(stat->rdev)) - return -EOVERFLOW; - - ino = stat->ino; - if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) - return -EOVERFLOW; - - err = access_ok(VERIFY_WRITE, statbuf, sizeof(*statbuf)) ? 0 : -EFAULT; - err |= __put_user(new_encode_dev(stat->dev), &statbuf->st_dev); - err |= __put_user(ino, &statbuf->st_ino); - err |= __put_user(stat->mode, &statbuf->st_mode); - err |= __put_user(stat->nlink, &statbuf->st_nlink); - err |= __put_user(stat->uid, &statbuf->st_uid); - err |= __put_user(stat->gid, &statbuf->st_gid); - err |= __put_user(new_encode_dev(stat->rdev), &statbuf->st_rdev); - err |= __put_user(stat->size, &statbuf->st_size); - err |= __put_user(stat->atime.tv_sec, &statbuf->st_atime); - err |= __put_user(stat->atime.tv_nsec, &statbuf->st_atime_nsec); - err |= __put_user(stat->mtime.tv_sec, &statbuf->st_mtime); - err |= __put_user(stat->mtime.tv_nsec, &statbuf->st_mtime_nsec); - err |= __put_user(stat->ctime.tv_sec, &statbuf->st_ctime); - err |= __put_user(stat->ctime.tv_nsec, &statbuf->st_ctime_nsec); - err |= __put_user(stat->blksize, &statbuf->st_blksize); - err |= __put_user(stat->blocks, &statbuf->st_blocks); - err |= __put_user(0, &statbuf->__unused4[0]); - err |= __put_user(0, &statbuf->__unused4[1]); - - return err; -} - /* Note: it is necessary to treat option as an unsigned int, * with the corresponding cast to a signed int to insure that the * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 98e246dc023..9b471d785ec 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -362,41 +362,6 @@ asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long high, unsigned return sys_ftruncate(fd, (high << 32) | low); } -int cp_compat_stat(struct kstat *stat, struct compat_stat __user *statbuf) -{ - compat_ino_t ino; - int err; - - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) - return -EOVERFLOW; - - ino = stat->ino; - if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) - return -EOVERFLOW; - - err = put_user(old_encode_dev(stat->dev), &statbuf->st_dev); - err |= put_user(stat->ino, &statbuf->st_ino); - err |= put_user(stat->mode, &statbuf->st_mode); - err |= put_user(stat->nlink, &statbuf->st_nlink); - err |= put_user(high2lowuid(stat->uid), &statbuf->st_uid); - err |= put_user(high2lowgid(stat->gid), &statbuf->st_gid); - err |= put_user(old_encode_dev(stat->rdev), &statbuf->st_rdev); - err |= put_user(stat->size, &statbuf->st_size); - err |= put_user(stat->atime.tv_sec, &statbuf->st_atime); - err |= put_user(stat->atime.tv_nsec, &statbuf->st_atime_nsec); - err |= put_user(stat->mtime.tv_sec, &statbuf->st_mtime); - err |= put_user(stat->mtime.tv_nsec, &statbuf->st_mtime_nsec); - err |= put_user(stat->ctime.tv_sec, &statbuf->st_ctime); - err |= put_user(stat->ctime.tv_nsec, &statbuf->st_ctime_nsec); - err |= put_user(stat->blksize, &statbuf->st_blksize); - err |= put_user(stat->blocks, &statbuf->st_blocks); -/* fixme - err |= put_user(0, &statbuf->__unused4[0]); - err |= put_user(0, &statbuf->__unused4[1]); -*/ - return err; -} - asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) { diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index 3320c9d0075..73a33dc3bcc 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -148,41 +148,6 @@ asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long high, unsigned return sys_ftruncate(fd, (high << 32) | low); } -int cp_compat_stat(struct kstat *stat, struct compat_stat __user *statbuf) -{ - compat_ino_t ino; - int err; - - if (stat->size > MAX_NON_LFS || !old_valid_dev(stat->dev) || - !old_valid_dev(stat->rdev)) - return -EOVERFLOW; - - ino = stat->ino; - if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) - return -EOVERFLOW; - - err = put_user(old_encode_dev(stat->dev), &statbuf->st_dev); - err |= put_user(stat->ino, &statbuf->st_ino); - err |= put_user(stat->mode, &statbuf->st_mode); - err |= put_user(stat->nlink, &statbuf->st_nlink); - err |= put_user(high2lowuid(stat->uid), &statbuf->st_uid); - err |= put_user(high2lowgid(stat->gid), &statbuf->st_gid); - err |= put_user(old_encode_dev(stat->rdev), &statbuf->st_rdev); - err |= put_user(stat->size, &statbuf->st_size); - err |= put_user(stat->atime.tv_sec, &statbuf->st_atime); - err |= put_user(stat->atime.tv_nsec, &statbuf->st_atime_nsec); - err |= put_user(stat->mtime.tv_sec, &statbuf->st_mtime); - err |= put_user(stat->mtime.tv_nsec, &statbuf->st_mtime_nsec); - err |= put_user(stat->ctime.tv_sec, &statbuf->st_ctime); - err |= put_user(stat->ctime.tv_nsec, &statbuf->st_ctime_nsec); - err |= put_user(stat->blksize, &statbuf->st_blksize); - err |= put_user(stat->blocks, &statbuf->st_blocks); - err |= put_user(0, &statbuf->__unused4[0]); - err |= put_user(0, &statbuf->__unused4[1]); - - return err; -} - static int cp_compat_stat64(struct kstat *stat, struct compat_stat64 __user *statbuf) { diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index beda4232ce6..4d3ad8d78a4 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -49,41 +49,6 @@ #define AA(__x) ((unsigned long)(__x)) -int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) -{ - compat_ino_t ino; - - typeof(ubuf->st_uid) uid = 0; - typeof(ubuf->st_gid) gid = 0; - SET_UID(uid, kbuf->uid); - SET_GID(gid, kbuf->gid); - if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) - return -EOVERFLOW; - if (kbuf->size >= 0x7fffffff) - return -EOVERFLOW; - ino = kbuf->ino; - if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) - return -EOVERFLOW; - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || - __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) || - __put_user(ino, &ubuf->st_ino) || - __put_user(kbuf->mode, &ubuf->st_mode) || - __put_user(kbuf->nlink, &ubuf->st_nlink) || - __put_user(uid, &ubuf->st_uid) || - __put_user(gid, &ubuf->st_gid) || - __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || - __put_user(kbuf->size, &ubuf->st_size) || - __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) || - __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) || - __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) || - __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user(kbuf->blksize, &ubuf->st_blksize) || - __put_user(kbuf->blocks, &ubuf->st_blocks)) - return -EFAULT; - return 0; -} asmlinkage long sys32_truncate64(char __user *filename, unsigned long offset_low, diff --git a/fs/compat.c b/fs/compat.c index aae13d31612..5f9ec449c79 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -137,6 +137,45 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _ return compat_sys_futimesat(AT_FDCWD, filename, t); } +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + compat_ino_t ino = stat->ino; + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + int err; + + SET_UID(uid, stat->uid); + SET_GID(gid, stat->gid); + + if ((u64) stat->size > MAX_NON_LFS || + !old_valid_dev(stat->dev) || + !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) + return -EOVERFLOW; + + if (clear_user(ubuf, sizeof(*ubuf))) + return -EFAULT; + + err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); + err |= __put_user(ino, &ubuf->st_ino); + err |= __put_user(stat->mode, &ubuf->st_mode); + err |= __put_user(stat->nlink, &ubuf->st_nlink); + err |= __put_user(uid, &ubuf->st_uid); + err |= __put_user(gid, &ubuf->st_gid); + err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); + err |= __put_user(stat->size, &ubuf->st_size); + err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); + err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); + err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); + err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); + err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); + err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); + err |= __put_user(stat->blksize, &ubuf->st_blksize); + err |= __put_user(stat->blocks, &ubuf->st_blocks); + return err; +} + asmlinkage long compat_sys_newstat(char __user * filename, struct compat_stat __user *statbuf) { diff --git a/include/linux/compat.h b/include/linux/compat.h index cf8d11cad5a..999dddd8d93 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -78,7 +78,6 @@ typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; -extern int cp_compat_stat(struct kstat *, struct compat_stat __user *); extern int get_compat_timespec(struct timespec *, const struct compat_timespec __user *); extern int put_compat_timespec(const struct timespec *, struct compat_timespec __user *); -- cgit v1.2.3-70-g09d2 From b418da16dd44810e5d5a22bba377cca80512a524 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Oct 2008 22:02:06 -0700 Subject: compat: generic compat get/settimeofday Nothing arch specific in get/settimeofday. The details of the timeval conversion varied a little from arch to arch, but all with the same results. Also add an extern declaration for sys_tz to linux/time.h because externs in .c files are fowned upon. I'll kill the externs in various other files in a sparate patch. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Hellwig Acked-by: David S. Miller [ sparc bits ] Cc: "Luck, Tony" Cc: Ralf Baechle Acked-by: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/ia32/ia32_entry.S | 4 +-- arch/ia64/ia32/sys_ia32.c | 56 ------------------------------- arch/mips/kernel/linux32.c | 66 ------------------------------------- arch/mips/kernel/scall64-n32.S | 4 +-- arch/mips/kernel/scall64-o32.S | 4 +-- arch/parisc/kernel/sys_parisc32.c | 58 --------------------------------- arch/parisc/kernel/syscall_table.S | 4 +-- arch/powerpc/kernel/sys_ppc32.c | 63 ----------------------------------- arch/s390/kernel/compat_linux.c | 67 -------------------------------------- arch/s390/kernel/compat_linux.h | 4 --- arch/s390/kernel/compat_wrapper.S | 12 +++---- arch/s390/kernel/syscalls.S | 4 +-- arch/sparc64/kernel/sys_sparc32.c | 62 ----------------------------------- arch/sparc64/kernel/systbls.S | 4 +-- arch/x86/ia32/ia32entry.S | 4 +-- arch/x86/ia32/sys_ia32.c | 64 ------------------------------------ include/linux/compat.h | 5 +++ include/linux/time.h | 2 ++ kernel/compat.c | 58 +++++++++++++++++++++++++++++++++ 19 files changed, 85 insertions(+), 460 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/ia32/ia32_entry.S b/arch/ia64/ia32/ia32_entry.S index ff88c48c5d1..53505bb0477 100644 --- a/arch/ia64/ia32/ia32_entry.S +++ b/arch/ia64/ia32/ia32_entry.S @@ -251,8 +251,8 @@ ia32_syscall_table: data8 compat_sys_setrlimit /* 75 */ data8 compat_sys_old_getrlimit data8 compat_sys_getrusage - data8 sys32_gettimeofday - data8 sys32_settimeofday + data8 compat_sys_gettimeofday + data8 compat_sys_settimeofday data8 sys32_getgroups16 /* 80 */ data8 sys32_setgroups16 data8 sys32_old_select diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 2362a8eefb3..f4430bb4bbd 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -1113,68 +1113,12 @@ sys32_pipe (int __user *fd) return retval; } -static inline long -get_tv32 (struct timeval *o, struct compat_timeval __user *i) -{ - return (!access_ok(VERIFY_READ, i, sizeof(*i)) || - (__get_user(o->tv_sec, &i->tv_sec) | __get_user(o->tv_usec, &i->tv_usec))); -} - -static inline long -put_tv32 (struct compat_timeval __user *o, struct timeval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->tv_sec, &o->tv_sec) | __put_user(i->tv_usec, &o->tv_usec))); -} - asmlinkage unsigned long sys32_alarm (unsigned int seconds) { return alarm_setitimer(seconds); } -/* Translations due to time_t size differences. Which affects all - sorts of things, like timeval and itimerval. */ - -extern struct timezone sys_tz; - -asmlinkage long -sys32_gettimeofday (struct compat_timeval __user *tv, struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -asmlinkage long -sys32_settimeofday (struct compat_timeval __user *tv, struct timezone __user *tz) -{ - struct timeval ktv; - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_tv32(&ktv, tv)) - return -EFAULT; - kts.tv_sec = ktv.tv_sec; - kts.tv_nsec = ktv.tv_usec * 1000; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - struct sel_arg_struct { unsigned int n; unsigned int inp; diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 89223a9bff2..aa2c55e3b55 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -133,72 +133,6 @@ asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long __dummy, return sys_ftruncate(fd, merge_64(a2, a3)); } -static inline long -get_tv32(struct timeval *o, struct compat_timeval __user *i) -{ - return (!access_ok(VERIFY_READ, i, sizeof(*i)) || - (__get_user(o->tv_sec, &i->tv_sec) | - __get_user(o->tv_usec, &i->tv_usec))); -} - -static inline long -put_tv32(struct compat_timeval __user *o, struct timeval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->tv_sec, &o->tv_sec) | - __put_user(i->tv_usec, &o->tv_usec))); -} - -extern struct timezone sys_tz; - -asmlinkage int -sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -static inline long get_ts32(struct timespec *o, struct compat_timeval __user *i) -{ - long usec; - - if (!access_ok(VERIFY_READ, i, sizeof(*i))) - return -EFAULT; - if (__get_user(o->tv_sec, &i->tv_sec)) - return -EFAULT; - if (__get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -asmlinkage int -sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_ts32(&kts, tv)) - return -EFAULT; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - asmlinkage int sys32_llseek(unsigned int fd, unsigned int offset_high, unsigned int offset_low, loff_t __user * result, unsigned int origin) diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 324c5499dec..e266b3aa656 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -214,7 +214,7 @@ EXPORT(sysn32_call_table) PTR sys_fchown PTR sys_lchown PTR sys_umask - PTR sys32_gettimeofday + PTR compat_sys_gettimeofday PTR compat_sys_getrlimit /* 6095 */ PTR compat_sys_getrusage PTR compat_sys_sysinfo @@ -279,7 +279,7 @@ EXPORT(sysn32_call_table) PTR sys_chroot PTR sys_sync PTR sys_acct - PTR sys32_settimeofday + PTR compat_sys_settimeofday PTR compat_sys_mount /* 6160 */ PTR sys_umount PTR sys_swapon diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 85fedac99a5..6c7ef8313eb 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -283,8 +283,8 @@ sys_call_table: PTR compat_sys_setrlimit /* 4075 */ PTR compat_sys_getrlimit PTR compat_sys_getrusage - PTR sys32_gettimeofday - PTR sys32_settimeofday + PTR compat_sys_gettimeofday + PTR compat_sys_settimeofday PTR sys_getgroups /* 4080 */ PTR sys_setgroups PTR sys_ni_syscall /* old_select */ diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 2c3af17e049..0838155b7a8 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -179,64 +179,6 @@ asmlinkage long sys32_sched_rr_get_interval(pid_t pid, return ret; } -static int -put_compat_timeval(struct compat_timeval __user *u, struct timeval *t) -{ - struct compat_timeval t32; - t32.tv_sec = t->tv_sec; - t32.tv_usec = t->tv_usec; - return copy_to_user(u, &t32, sizeof t32); -} - -static inline long get_ts32(struct timespec *o, struct compat_timeval __user *i) -{ - long usec; - - if (__get_user(o->tv_sec, &i->tv_sec)) - return -EFAULT; - if (__get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -asmlinkage int -sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - extern void do_gettimeofday(struct timeval *tv); - - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (put_compat_timeval(tv, &ktv)) - return -EFAULT; - } - if (tz) { - extern struct timezone sys_tz; - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -asmlinkage -int sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_ts32(&kts, tv)) - return -EFAULT; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - /*** copied from mips64 ***/ /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index 6b5ac38f5a9..c7e59f54881 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -149,8 +149,8 @@ ENTRY_COMP(getrlimit) ENTRY_COMP(getrusage) /* struct timeval and timezone are maybe?? consistent wide and narrow */ - ENTRY_DIFF(gettimeofday) - ENTRY_DIFF(settimeofday) + ENTRY_COMP(gettimeofday) + ENTRY_COMP(settimeofday) ENTRY_SAME(getgroups) /* 80 */ ENTRY_SAME(setgroups) /* struct socketaddr... */ diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index d00599bb24a..bb1cfcfdbbb 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -71,69 +71,6 @@ asmlinkage long compat_sys_sysfs(u32 option, u32 arg1, u32 arg2) return sys_sysfs((int)option, arg1, arg2); } -static inline long get_ts32(struct timespec *o, struct compat_timeval __user *i) -{ - long usec; - - if (!access_ok(VERIFY_READ, i, sizeof(*i))) - return -EFAULT; - if (__get_user(o->tv_sec, &i->tv_sec)) - return -EFAULT; - if (__get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->tv_sec, &o->tv_sec) | - __put_user(i->tv_usec, &o->tv_usec))); -} - - - - -/* Translations due to time_t size differences. Which affects all - sorts of things, like timeval and itimerval. */ -extern struct timezone sys_tz; - -asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - - return 0; -} - - - -asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_ts32(&kts, tv)) - return -EFAULT; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - #ifdef CONFIG_SYSVIPC long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t ptr, u32 fifth) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 9b471d785ec..4646382af34 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -279,22 +279,6 @@ asmlinkage long sys32_getegid16(void) return high2lowgid(current->egid); } -/* 32-bit timeval and related flotsam. */ - -static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i) -{ - return (!access_ok(VERIFY_READ, o, sizeof(*o)) || - (__get_user(o->tv_sec, &i->tv_sec) || - __get_user(o->tv_usec, &i->tv_usec))); -} - -static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->tv_sec, &o->tv_sec) || - __put_user(i->tv_usec, &o->tv_usec))); -} - /* * sys32_ipc() is the de-multiplexer for the SysV IPC calls in 32bit emulation. * @@ -522,57 +506,6 @@ sys32_delete_module(const char __user *name_user, unsigned int flags) #endif /* CONFIG_MODULES */ -/* Translations due to time_t size differences. Which affects all - sorts of things, like timeval and itimerval. */ - -extern struct timezone sys_tz; - -asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -static inline long get_ts32(struct timespec *o, struct compat_timeval __user *i) -{ - long usec; - - if (!access_ok(VERIFY_READ, i, sizeof(*i))) - return -EFAULT; - if (__get_user(o->tv_sec, &i->tv_sec)) - return -EFAULT; - if (__get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_ts32(&kts, tv)) - return -EFAULT; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - asmlinkage long sys32_pread64(unsigned int fd, char __user *ubuf, size_t count, u32 poshi, u32 poslo) { diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index 05f8516366a..836a2884290 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -202,10 +202,6 @@ long sys32_execve(void); long sys32_init_module(void __user *umod, unsigned long len, const char __user *uargs); long sys32_delete_module(const char __user *name_user, unsigned int flags); -long sys32_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz); -long sys32_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz); long sys32_pread64(unsigned int fd, char __user *ubuf, size_t count, u32 poshi, u32 poslo); long sys32_pwrite64(unsigned int fd, const char __user *ubuf, diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index ee51ca9e23b..fc2c97197a5 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -332,17 +332,17 @@ compat_sys_getrusage_wrapper: llgtr %r3,%r3 # struct rusage_emu31 * jg compat_sys_getrusage # branch to system call - .globl sys32_gettimeofday_wrapper -sys32_gettimeofday_wrapper: + .globl compat_sys_gettimeofday_wrapper +compat_sys_gettimeofday_wrapper: llgtr %r2,%r2 # struct timeval_emu31 * llgtr %r3,%r3 # struct timezone * - jg sys32_gettimeofday # branch to system call + jg compat_sys_gettimeofday # branch to system call - .globl sys32_settimeofday_wrapper -sys32_settimeofday_wrapper: + .globl compat_sys_settimeofday_wrapper +compat_sys_settimeofday_wrapper: llgtr %r2,%r2 # struct timeval_emu31 * llgtr %r3,%r3 # struct timezone * - jg sys32_settimeofday # branch to system call + jg compat_sys_settimeofday # branch to system call .globl sys32_getgroups16_wrapper sys32_getgroups16_wrapper: diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 3ae303914b4..2d61787949d 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -86,8 +86,8 @@ SYSCALL(sys_sethostname,sys_sethostname,sys32_sethostname_wrapper) SYSCALL(sys_setrlimit,sys_setrlimit,compat_sys_setrlimit_wrapper) /* 75 */ SYSCALL(sys_old_getrlimit,sys_getrlimit,compat_sys_old_getrlimit_wrapper) SYSCALL(sys_getrusage,sys_getrusage,compat_sys_getrusage_wrapper) -SYSCALL(sys_gettimeofday,sys_gettimeofday,sys32_gettimeofday_wrapper) -SYSCALL(sys_settimeofday,sys_settimeofday,sys32_settimeofday_wrapper) +SYSCALL(sys_gettimeofday,sys_gettimeofday,compat_sys_gettimeofday_wrapper) +SYSCALL(sys_settimeofday,sys_settimeofday,compat_sys_settimeofday_wrapper) SYSCALL(sys_getgroups16,sys_ni_syscall,sys32_getgroups16_wrapper) /* 80 old getgroups16 syscall */ SYSCALL(sys_setgroups16,sys_ni_syscall,sys32_setgroups16_wrapper) /* old setgroups16 syscall */ NI_SYSCALL /* old select syscall */ diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c index 73a33dc3bcc..e800503879e 100644 --- a/arch/sparc64/kernel/sys_sparc32.c +++ b/arch/sparc64/kernel/sys_sparc32.c @@ -58,15 +58,6 @@ #include #include -/* 32-bit timeval and related flotsam. */ - -static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->tv_sec, &o->tv_sec) | - __put_user(i->tv_usec, &o->tv_usec))); -} - #ifdef CONFIG_SYSVIPC asmlinkage long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t ptr, u32 fifth) { @@ -487,59 +478,6 @@ asmlinkage long sys32_delete_module(const char __user *name_user) #endif /* CONFIG_MODULES */ -/* Translations due to time_t size differences. Which affects all - sorts of things, like timeval and itimerval. */ - -extern struct timezone sys_tz; - -asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -static inline long get_ts32(struct timespec *o, struct compat_timeval __user *i) -{ - long usec; - - if (!access_ok(VERIFY_READ, i, sizeof(*i))) - return -EFAULT; - if (__get_user(o->tv_sec, &i->tv_sec)) - return -EFAULT; - if (__get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_ts32(&kts, tv)) - return -EFAULT; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - asmlinkage compat_ssize_t sys32_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index 5daee4b04dd..b2fa4c16363 100644 --- a/arch/sparc64/kernel/systbls.S +++ b/arch/sparc64/kernel/systbls.S @@ -41,8 +41,8 @@ sys_call_table32: /*100*/ .word sys32_getpriority, sys32_rt_sigreturn, sys32_rt_sigaction, sys32_rt_sigprocmask, sys32_rt_sigpending .word compat_sys_rt_sigtimedwait, sys32_rt_sigqueueinfo, compat_sys_rt_sigsuspend, sys_setresuid, sys_getresuid /*110*/ .word sys_setresgid, sys_getresgid, sys_setregid, sys_nis_syscall, sys_nis_syscall - .word sys32_getgroups, sys32_gettimeofday, sys32_getrusage, sys_nis_syscall, sys_getcwd -/*120*/ .word compat_sys_readv, compat_sys_writev, sys32_settimeofday, sys_fchown16, sys_fchmod + .word sys32_getgroups, compat_sys_gettimeofday, sys32_getrusage, sys_nis_syscall, sys_getcwd +/*120*/ .word compat_sys_readv, compat_sys_writev, compat_sys_settimeofday, sys_fchown16, sys_fchmod .word sys_nis_syscall, sys_setreuid16, sys_setregid16, sys_rename, sys_truncate /*130*/ .word sys_ftruncate, sys_flock, compat_sys_lstat64, sys_nis_syscall, sys_nis_syscall .word sys_nis_syscall, sys32_mkdir, sys_rmdir, compat_sys_utimes, compat_sys_stat64 diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index eb4314768bf..256b00b6189 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -571,8 +571,8 @@ ia32_sys_call_table: .quad compat_sys_setrlimit /* 75 */ .quad compat_sys_old_getrlimit /* old_getrlimit */ .quad compat_sys_getrusage - .quad sys32_gettimeofday - .quad sys32_settimeofday + .quad compat_sys_gettimeofday + .quad compat_sys_settimeofday .quad sys_getgroups16 /* 80 */ .quad sys_setgroups16 .quad sys32_old_select diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 4d3ad8d78a4..2e09dcd3c0a 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -367,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, return 0; } -static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i) -{ - int err = -EFAULT; - - if (access_ok(VERIFY_READ, i, sizeof(*i))) { - err = __get_user(o->tv_sec, &i->tv_sec); - err |= __get_user(o->tv_usec, &i->tv_usec); - } - return err; -} - -static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) -{ - int err = -EFAULT; - - if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { - err = __put_user(i->tv_sec, &o->tv_sec); - err |= __put_user(i->tv_usec, &o->tv_usec); - } - return err; -} - asmlinkage long sys32_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } -/* - * Translations due to time_t size differences. Which affects all - * sorts of things, like timeval and itimerval. - */ -asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - struct timeval ktv; - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_tv32(&ktv, tv)) - return -EFAULT; - kts.tv_sec = ktv.tv_sec; - kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - struct sel_arg_struct { unsigned int n; unsigned int inp; diff --git a/include/linux/compat.h b/include/linux/compat.h index 999dddd8d93..f061a1ea1b7 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -234,6 +234,11 @@ extern int get_compat_itimerspec(struct itimerspec *dst, extern int put_compat_itimerspec(struct compat_itimerspec __user *dst, const struct itimerspec *src); +asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz); +asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz); + asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern int compat_printk(const char *fmt, ...); diff --git a/include/linux/time.h b/include/linux/time.h index e15206a7e82..51e883df0fa 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -29,6 +29,8 @@ struct timezone { #ifdef __KERNEL__ +extern struct timezone sys_tz; + /* Parameters used to convert the timespec values: */ #define MSEC_PER_SEC 1000L #define USEC_PER_MSEC 1000L diff --git a/kernel/compat.c b/kernel/compat.c index 32c254a8ab9..143990e48cb 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,6 +26,64 @@ #include +/* + * Note that the native side is already converted to a timespec, because + * that's what we want anyway. + */ +static int compat_get_timeval(struct timespec *o, + struct compat_timeval __user *i) +{ + long usec; + + if (get_user(o->tv_sec, &i->tv_sec) || + get_user(usec, &i->tv_usec)) + return -EFAULT; + o->tv_nsec = usec * 1000; + return 0; +} + +static int compat_put_timeval(struct compat_timeval __user *o, + struct timeval *i) +{ + return (put_user(i->tv_sec, &o->tv_sec) || + put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; +} + +asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + if (tv) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (compat_put_timeval(tv, &ktv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + struct timespec kts; + struct timezone ktz; + + if (tv) { + if (compat_get_timeval(&kts, tv)) + return -EFAULT; + } + if (tz) { + if (copy_from_user(&ktz, tz, sizeof(ktz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); +} + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || -- cgit v1.2.3-70-g09d2 From bdab0ba3d9ad8de257ee6236daf314723748fde6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 15 Oct 2008 22:02:07 -0700 Subject: x86: rename iommu_num_pages function to iommu_nr_pages This series of patches re-introduces the iommu_num_pages function so that it can be used by each architecture specific IOMMU implementations. The series also changes IOMMU implementations for X86, Alpha, PowerPC and UltraSparc. The other implementations are not yet changed because the modifications required are not obvious and I can't test them on real hardware. This patch: This is a preparation patch for introducing a generic iommu_num_pages function. Signed-off-by: Joerg Roedel Cc: "David S. Miller" Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar Cc: Thomas Gleixner Cc: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Dave Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/amd_iommu.c | 8 ++++---- arch/x86/kernel/pci-dma.c | 4 ++-- arch/x86/kernel/pci-gart_64.c | 8 ++++---- include/asm-x86/iommu.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 34e4d112b1e..10646acba9b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -295,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = iommu_num_pages(address, size); + unsigned pages = iommu_nr_pages(address, size); address &= PAGE_MASK; @@ -679,7 +679,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, + int pages = iommu_nr_pages(iommu->exclusion_start, iommu->exclusion_length); dma_ops_reserve_addresses(dma_dom, startpage, pages); } @@ -935,7 +935,7 @@ static dma_addr_t __map_single(struct device *dev, unsigned long align_mask = 0; int i; - pages = iommu_num_pages(paddr, size); + pages = iommu_nr_pages(paddr, size); paddr &= PAGE_MASK; if (align) @@ -980,7 +980,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = iommu_num_pages(dma_addr, size); + pages = iommu_nr_pages(dma_addr, size); dma_addr &= PAGE_MASK; start = dma_addr; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0a3824e837b..19262482021 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -125,13 +125,13 @@ void __init pci_iommu_alloc(void) pci_swiotlb_init(); } -unsigned long iommu_num_pages(unsigned long addr, unsigned long len) +unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) { unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); return size >> PAGE_SHIFT; } -EXPORT_SYMBOL(iommu_num_pages); +EXPORT_SYMBOL(iommu_nr_pages); #endif void *dma_generic_alloc_coherent(struct device *dev, size_t size, diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 145f1c83369..14f1b41348f 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -231,7 +231,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { - unsigned long npages = iommu_num_pages(phys_mem, size); + unsigned long npages = iommu_nr_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); int i; @@ -285,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = iommu_num_pages(dma_addr, size); + npages = iommu_nr_pages(dma_addr, size); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -368,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = iommu_num_pages(s->offset, s->length); + pages = iommu_nr_pages(s->offset, s->length); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -451,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += iommu_num_pages(s->offset, s->length); + pages += iommu_nr_pages(s->offset, s->length); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) diff --git a/include/asm-x86/iommu.h b/include/asm-x86/iommu.h index 546ad3110fe..961e746da97 100644 --- a/include/asm-x86/iommu.h +++ b/include/asm-x86/iommu.h @@ -8,7 +8,7 @@ extern int force_iommu, no_iommu; extern int iommu_detected; extern int dmar_disabled; -extern unsigned long iommu_num_pages(unsigned long addr, unsigned long len); +extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len); #ifdef CONFIG_GART_IOMMU extern int gart_iommu_aperture; -- cgit v1.2.3-70-g09d2 From 1477b8e5f13266bbf0389baafd39a90ff29c5f61 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 15 Oct 2008 22:02:11 -0700 Subject: x86: convert GART driver to generic iommu_num_pages function Signed-off-by: Joerg Roedel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Dave Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-gart_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 14f1b41348f..e3f75bbcede 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -231,7 +231,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { - unsigned long npages = iommu_nr_pages(phys_mem, size); + unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); int i; @@ -285,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = iommu_nr_pages(dma_addr, size); + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -368,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = iommu_nr_pages(s->offset, s->length); + pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -451,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += iommu_nr_pages(s->offset, s->length); + pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) -- cgit v1.2.3-70-g09d2 From e3c449f526cebb8d287241c7e82faafd9709668b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 15 Oct 2008 22:02:11 -0700 Subject: x86, AMD IOMMU: convert driver to generic iommu_num_pages function Signed-off-by: Joerg Roedel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: FUJITA Tomonori Cc: Muli Ben-Yehuda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/amd_iommu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 10646acba9b..a8fd9ebdc8e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -295,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = iommu_nr_pages(address, size); + unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -679,8 +679,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_nr_pages(iommu->exclusion_start, - iommu->exclusion_length); + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); dma_ops_reserve_addresses(dma_dom, startpage, pages); } @@ -935,7 +936,7 @@ static dma_addr_t __map_single(struct device *dev, unsigned long align_mask = 0; int i; - pages = iommu_nr_pages(paddr, size); + pages = iommu_num_pages(paddr, size, PAGE_SIZE); paddr &= PAGE_MASK; if (align) @@ -980,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = iommu_nr_pages(dma_addr, size); + pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); dma_addr &= PAGE_MASK; start = dma_addr; -- cgit v1.2.3-70-g09d2 From 036b4c50fe99a2f308f36561335b9904ab507972 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 15 Oct 2008 22:02:12 -0700 Subject: x86: convert Calgary IOMMU driver to generic iommu_num_pages function Signed-off-by: Joerg Roedel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 080d1d27f37..e1e731d78f3 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, #endif /* CONFIG_IOMMU_DEBUG */ -static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) -{ - unsigned int npages; - - npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); - npages >>= PAGE_SHIFT; - - return npages; -} - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -408,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev, if (dmalen == 0) break; - npages = num_dma_pages(dma, dmalen); + npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); iommu_free(tbl, dma, npages); } } @@ -427,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, BUG_ON(!sg_page(s)); vaddr = (unsigned long) sg_virt(s); - npages = num_dma_pages(vaddr, s->length); + npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); if (entry == bad_dma_address) { @@ -464,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, struct iommu_table *tbl = find_iommu_table(dev); uaddr = (unsigned long)vaddr; - npages = num_dma_pages(uaddr, size); + npages = iommu_num_pages(uaddr, size, PAGE_SIZE); return iommu_alloc(dev, tbl, vaddr, npages, direction); } @@ -475,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - npages = num_dma_pages(dma_handle, size); + npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); iommu_free(tbl, dma_handle, npages); } -- cgit v1.2.3-70-g09d2 From 4d31a2b74c6d063362ae10ce3be3e80d8713bf23 Mon Sep 17 00:00:00 2001 From: Michal Januszewski Date: Wed, 15 Oct 2008 22:03:51 -0700 Subject: fbdev: ignore VESA modes if framebuffer does not support them Currently, it is possible to set a graphics VESA mode at boot time via the vga= parameter even when no framebuffer driver supporting this is configured. This could lead to the system booting with a black screen, without a usable console. Fix this problem by only allowing to set graphics modes at boot time if a supporting framebuffer driver is configured. Signed-off-by: Michal Januszewski Acked-by: Krzysztof Helt Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/boot/video-vesa.c | 9 ++++----- drivers/video/Kconfig | 11 +++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 1e6fe0214c8..99b3079dc6a 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -88,14 +88,11 @@ static int vesa_probe(void) (vminfo.memory_layout == 4 || vminfo.memory_layout == 6) && vminfo.memory_planes == 1) { -#ifdef CONFIG_FB +#ifdef CONFIG_FB_BOOT_VESA_SUPPORT /* Graphics mode, color, linear frame buffer supported. Only register the mode if if framebuffer is configured, however, - otherwise the user will be left without a screen. - We don't require CONFIG_FB_VESA, however, since - some of the other framebuffer drivers can use - this mode-setting, too. */ + otherwise the user will be left without a screen. */ mi = GET_HEAP(struct mode_info, 1); mi->mode = mode + VIDEO_FIRST_VESA; mi->depth = vminfo.bpp; @@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode) if ((vminfo.mode_attr & 0x15) == 0x05) { /* It's a supported text mode */ is_graphic = 0; +#ifdef CONFIG_FB_BOOT_VESA_SUPPORT } else if ((vminfo.mode_attr & 0x99) == 0x99) { /* It's a graphics mode with linear frame buffer */ is_graphic = 1; vesa_mode |= 0x4000; /* Request linear frame buffer */ +#endif } else { return -1; /* Invalid mode */ } diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index 3b296e410f9..6210755cf6a 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -76,6 +76,14 @@ config FB_DDC select I2C default n +config FB_BOOT_VESA_SUPPORT + bool + depends on FB + default n + ---help--- + If true, at least one selected framebuffer driver can take advantage + of VESA video modes set at an early boot stage via the vga= parameter. + config FB_CFB_FILLRECT tristate depends on FB @@ -681,6 +689,7 @@ config FB_VESA select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT + select FB_BOOT_VESA_SUPPORT help This is the frame buffer device driver for generic VESA 2.0 compliant graphic cards. The older VESA 1.2 cards are not supported. @@ -1117,6 +1126,7 @@ config FB_INTEL select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT + select FB_BOOT_VESA_SUPPORT help This driver supports the on-board graphics built in to the Intel 830M/845G/852GM/855GM/865G/915G/915GM/945G/945GM/965G/965GM chipsets. @@ -1469,6 +1479,7 @@ config FB_SIS select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT + select FB_BOOT_VESA_SUPPORT help This is the frame buffer device driver for the SiS 300, 315, 330 and 340 series as well as XGI V3XT, V5, V8, Z7 graphics chipsets. -- cgit v1.2.3-70-g09d2 From 26adcfbf00e0726b4469070aa2f530dcf963f484 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 14 Oct 2008 21:01:15 +0200 Subject: x86: SB600: skip ACPI IRQ0 override if it is not routed to INT2 of IOAPIC On some more HP laptops BIOS reports an IRQ0 override but the SB600 chipset is configured such that timer interrupts go to INT0 of IOAPIC. Check IRQ0 routing and if it is routed to INT0 of IOAPIC skip the timer override. http://bugzilla.kernel.org/show_bug.cgi?id=11715 http://bugzilla.kernel.org/show_bug.cgi?id=11516 Signed-off-by: Andreas Herrmann Signed-off-by: Len Brown --- arch/x86/kernel/early-quirks.c | 55 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 733c4f8d42e..3ce029ffaa5 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -95,7 +95,8 @@ static void __init nvidia_bugs(int num, int slot, int func) } -static u32 ati_ixp4x0_rev(int num, int slot, int func) +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; u8 b; @@ -115,7 +116,6 @@ static u32 ati_ixp4x0_rev(int num, int slot, int func) static void __init ati_bugs(int num, int slot, int func) { -#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC) u32 d; u8 b; @@ -138,9 +138,56 @@ static void __init ati_bugs(int num, int slot, int func) printk(KERN_INFO "If you got timer trouble " "try acpi_use_timer_override\n"); } -#endif } +static u32 __init ati_sbx00_rev(int num, int slot, int func) +{ + u32 old, d; + + d = read_pci_config(num, slot, func, 0x70); + old = d; + d &= ~(1<<8); + write_pci_config(num, slot, func, 0x70, d); + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + write_pci_config(num, slot, func, 0x70, old); + + return d; +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ + u32 d, rev; + + if (acpi_use_timer_override) + return; + + rev = ati_sbx00_rev(num, slot, func); + if (rev > 0x13) + return; + + /* check for IRQ0 interrupt swap */ + d = read_pci_config(num, slot, func, 0x64); + if (!(d & (1<<14))) + acpi_skip_timer_override = 1; + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB600 revision 0x%x\n", rev); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} +#else +static void __init ati_bugs(int num, int slot, int func) +{ +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ +} +#endif + #ifdef CONFIG_DMAR static void __init intel_g33_dmar(int num, int slot, int func) { @@ -176,6 +223,8 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, #ifdef CONFIG_DMAR { PCI_VENDOR_ID_INTEL, 0x29c0, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, -- cgit v1.2.3-70-g09d2 From 89cedfefca1d446ee2598fd3bcbb23ee3802e26a Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 16 Oct 2008 19:00:08 -0400 Subject: cpuidle: upon BIOS bug, default to default_idle rather than polling http://bugzilla.kernel.org/show_bug.cgi?id=11345 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Len Brown --- arch/x86/Kconfig | 3 +++ drivers/cpuidle/cpuidle.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d132..f8caf040650 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -123,6 +123,9 @@ config GENERIC_TIME_VSYSCALL config ARCH_HAS_CPU_RELAX def_bool y +config ARCH_HAS_DEFAULT_IDLE + def_bool y + config ARCH_HAS_CACHE_LINE_SIZE def_bool y diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index eb2cade562d..bb6e3b33804 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -56,7 +56,11 @@ static void cpuidle_idle_call(void) if (pm_idle_old) pm_idle_old(); else +#if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE) + default_idle(); +#else local_irq_enable(); +#endif return; } -- cgit v1.2.3-70-g09d2 From 3038edabf48f01421c621cb77a712b446d3a5d67 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 17 Oct 2008 01:26:27 +0200 Subject: x86 ACPI: fix breakage of resume on 64-bit UP systems with SMP kernel x86 ACPI: Fix breakage of resume on 64-bit UP systems with SMP kernel We are now using per CPU GDT tables in head_64.S and the original early_gdt_descr.address is invalidated after boot by setup_per_cpu_areas(). This breaks resume from suspend to RAM on x86_64 UP systems using SMP kernels, because this part of head_64.S is also executed during the resume and the invalid GDT address causes the system to crash. It doesn't break on 'true' SMP systems, because early_gdt_descr.address is modified every time native_cpu_up() runs. However, during resume it should point to the GDT of the boot CPU rather than to another CPU's GDT. For this reason, during suspend to RAM always make early_gdt_descr.address point to the boot CPU's GDT. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=11568, which is a regression from 2.6.26. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Signed-off-by: Ingo Molnar Reported-and-tested-by: Andy Wettstein --- arch/x86/kernel/acpi/sleep.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 426e5d91b63..c44cd6dbfa1 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "realmode/wakeup.h" #include "sleep.h" @@ -98,6 +99,8 @@ int acpi_save_state_mem(void) header->trampoline_segment = setup_trampoline() >> 4; #ifdef CONFIG_SMP stack_start.sp = temp_stack + 4096; + early_gdt_descr.address = + (unsigned long)get_cpu_gdt_table(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; -- cgit v1.2.3-70-g09d2 From d1d8c925b71dd6753bf438f9e14a9e5c5183bcc6 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 21 Aug 2008 12:53:33 -0700 Subject: Export kmap_atomic_pfn for DRM-GEM. The driver would like to map IO space directly for copying data in when appropriate, to avoid CPU cache flushing for streaming writes. kmap_atomic_pfn lets us avoid IPIs associated with ioremap for this process. Signed-off-by: Eric Anholt Signed-off-by: Dave Airlie --- arch/x86/mm/highmem_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 165c871ba9a..bcc079c282d 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) return (void*) vaddr; } +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ struct page *kmap_atomic_to_page(void *ptr) { -- cgit v1.2.3-70-g09d2 From 5b6985ce8ec7127b4d60ad450b64ca8b82748a3b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 16 Oct 2008 18:02:32 -0700 Subject: intel-iommu: IA64 support The current Intel IOMMU code assumes that both host page size and Intel IOMMU page size are 4KiB. The first patch supports variable page size. This provides support for IA64 which has multiple page sizes. This patch also adds some other code hooks for IA64 platform including DMAR_OPERATION_TIMEOUT definition. [dwmw2: some cleanup] Signed-off-by: Fenghua Yu Signed-off-by: Tony Luck Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 16 ------ drivers/pci/dmar.c | 19 ++++--- drivers/pci/intel-iommu.c | 128 ++++++++++++++++++++++-------------------- drivers/pci/quirks.c | 14 +++++ include/asm-x86/iommu.h | 4 ++ include/linux/dma_remapping.h | 27 +++++---- include/linux/intel-iommu.h | 39 +++++++------ 7 files changed, 131 insertions(+), 116 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 19262482021..1972266e8ba 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -9,8 +9,6 @@ #include #include -static int forbid_dac __read_mostly; - struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); @@ -293,17 +291,3 @@ void pci_iommu_shutdown(void) } /* Must execute after PCI subsystem */ fs_initcall(pci_iommu_init); - -#ifdef CONFIG_PCI -/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ - -static __devinit void via_no_dac(struct pci_dev *dev) -{ - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected." - "Disabling DAC.\n"); - forbid_dac = 1; - } -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); -#endif diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 44d6c7081b8..b65173828bc 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -277,14 +277,15 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header) drhd = (struct acpi_dmar_hardware_unit *)header; printk (KERN_INFO PREFIX "DRHD (flags: 0x%08x)base: 0x%016Lx\n", - drhd->flags, drhd->address); + drhd->flags, (unsigned long long)drhd->address); break; case ACPI_DMAR_TYPE_RESERVED_MEMORY: rmrr = (struct acpi_dmar_reserved_memory *)header; printk (KERN_INFO PREFIX "RMRR base: 0x%016Lx end: 0x%016Lx\n", - rmrr->base_address, rmrr->end_address); + (unsigned long long)rmrr->base_address, + (unsigned long long)rmrr->end_address); break; } } @@ -304,7 +305,7 @@ parse_dmar_table(void) if (!dmar) return -ENODEV; - if (dmar->width < PAGE_SHIFT_4K - 1) { + if (dmar->width < PAGE_SHIFT - 1) { printk(KERN_WARNING PREFIX "Invalid DMAR haw\n"); return -EINVAL; } @@ -493,7 +494,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) iommu->seq_id = iommu_allocated++; - iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K); + iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE); if (!iommu->reg) { printk(KERN_ERR "IOMMU: can't map the region\n"); goto error; @@ -504,8 +505,8 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) /* the registers might be more than one page */ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), cap_max_fault_reg_offset(iommu->cap)); - map_size = PAGE_ALIGN_4K(map_size); - if (map_size > PAGE_SIZE_4K) { + map_size = VTD_PAGE_ALIGN(map_size); + if (map_size > VTD_PAGE_SIZE) { iounmap(iommu->reg); iommu->reg = ioremap(drhd->reg_base_addr, map_size); if (!iommu->reg) { @@ -516,8 +517,10 @@ int alloc_iommu(struct dmar_drhd_unit *drhd) ver = readl(iommu->reg + DMAR_VER_REG); pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n", - drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver), - iommu->cap, iommu->ecap); + (unsigned long long)drhd->reg_base_addr, + DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver), + (unsigned long long)iommu->cap, + (unsigned long long)iommu->ecap); spin_lock_init(&iommu->register_lock); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 50947041913..2bf96babbc4 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -18,6 +18,7 @@ * Author: Ashok Raj * Author: Shaohua Li * Author: Anil S Keshavamurthy + * Author: Fenghua Yu */ #include @@ -35,11 +36,13 @@ #include #include #include -#include /* force_iommu in this header in x86-64*/ #include #include #include "pci.h" +#define ROOT_SIZE VTD_PAGE_SIZE +#define CONTEXT_SIZE VTD_PAGE_SIZE + #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) @@ -199,7 +202,7 @@ static struct context_entry * device_to_context_entry(struct intel_iommu *iommu, spin_unlock_irqrestore(&iommu->lock, flags); return NULL; } - __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K); + __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); phy_addr = virt_to_phys((void *)context); set_root_value(root, phy_addr); set_root_present(root); @@ -345,7 +348,7 @@ static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr) return NULL; } __iommu_flush_cache(domain->iommu, tmp_page, - PAGE_SIZE_4K); + PAGE_SIZE); dma_set_pte_addr(*pte, virt_to_phys(tmp_page)); /* * high level table always sets r/w, last level page @@ -408,13 +411,13 @@ static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end) start &= (((u64)1) << addr_width) - 1; end &= (((u64)1) << addr_width) - 1; /* in case it's partial page */ - start = PAGE_ALIGN_4K(start); - end &= PAGE_MASK_4K; + start = PAGE_ALIGN(start); + end &= PAGE_MASK; /* we don't need lock here, nobody else touches the iova range */ while (start < end) { dma_pte_clear_one(domain, start); - start += PAGE_SIZE_4K; + start += VTD_PAGE_SIZE; } } @@ -468,7 +471,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu) if (!root) return -ENOMEM; - __iommu_flush_cache(iommu, root, PAGE_SIZE_4K); + __iommu_flush_cache(iommu, root, ROOT_SIZE); spin_lock_irqsave(&iommu->lock, flags); iommu->root_entry = root; @@ -634,7 +637,8 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, printk(KERN_ERR"IOMMU: flush IOTLB failed\n"); if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", - DMA_TLB_IIRG(type), DMA_TLB_IAIG(val)); + (unsigned long long)DMA_TLB_IIRG(type), + (unsigned long long)DMA_TLB_IAIG(val)); /* flush context entry will implictly flush write buffer */ return 0; } @@ -644,7 +648,7 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, { unsigned int mask; - BUG_ON(addr & (~PAGE_MASK_4K)); + BUG_ON(addr & (~VTD_PAGE_MASK)); BUG_ON(pages == 0); /* Fallback to domain selective flush if no PSI support */ @@ -798,7 +802,7 @@ void dmar_msi_read(int irq, struct msi_msg *msg) } static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type, - u8 fault_reason, u16 source_id, u64 addr) + u8 fault_reason, u16 source_id, unsigned long long addr) { const char *reason; @@ -1051,9 +1055,9 @@ static void dmar_init_reserved_ranges(void) if (!r->flags || !(r->flags & IORESOURCE_MEM)) continue; addr = r->start; - addr &= PAGE_MASK_4K; + addr &= PAGE_MASK; size = r->end - addr; - size = PAGE_ALIGN_4K(size); + size = PAGE_ALIGN(size); iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr), IOVA_PFN(size + addr) - 1); if (!iova) @@ -1115,7 +1119,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width) domain->pgd = (struct dma_pte *)alloc_pgtable_page(); if (!domain->pgd) return -ENOMEM; - __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K); + __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE); return 0; } @@ -1131,7 +1135,7 @@ static void domain_exit(struct dmar_domain *domain) /* destroy iovas */ put_iova_domain(&domain->iovad); end = DOMAIN_MAX_ADDR(domain->gaw); - end = end & (~PAGE_MASK_4K); + end = end & (~PAGE_MASK); /* clear ptes */ dma_pte_clear_range(domain, 0, end); @@ -1252,22 +1256,25 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, u64 start_pfn, end_pfn; struct dma_pte *pte; int index; + int addr_width = agaw_to_width(domain->agaw); + + hpa &= (((u64)1) << addr_width) - 1; if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; - iova &= PAGE_MASK_4K; - start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K; - end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K; + iova &= PAGE_MASK; + start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT; + end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT; index = 0; while (start_pfn < end_pfn) { - pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index); + pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index); if (!pte) return -ENOMEM; /* We don't need lock here, nobody else * touches the iova range */ BUG_ON(dma_pte_addr(*pte)); - dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K); + dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT); dma_set_pte_prot(*pte, prot); __iommu_flush_cache(domain->iommu, pte, sizeof(*pte)); start_pfn++; @@ -1445,11 +1452,13 @@ error: return find_domain(pdev); } -static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end) +static int iommu_prepare_identity_map(struct pci_dev *pdev, + unsigned long long start, + unsigned long long end) { struct dmar_domain *domain; unsigned long size; - u64 base; + unsigned long long base; int ret; printk(KERN_INFO @@ -1461,9 +1470,9 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end) return -ENOMEM; /* The address might not be aligned */ - base = start & PAGE_MASK_4K; + base = start & PAGE_MASK; size = end - base; - size = PAGE_ALIGN_4K(size); + size = PAGE_ALIGN(size); if (!reserve_iova(&domain->iovad, IOVA_PFN(base), IOVA_PFN(base + size) - 1)) { printk(KERN_ERR "IOMMU: reserve iova failed\n"); @@ -1732,8 +1741,8 @@ error: static inline u64 aligned_size(u64 host_addr, size_t size) { u64 addr; - addr = (host_addr & (~PAGE_MASK_4K)) + size; - return PAGE_ALIGN_4K(addr); + addr = (host_addr & (~PAGE_MASK)) + size; + return PAGE_ALIGN(addr); } struct iova * @@ -1747,7 +1756,7 @@ iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end) return NULL; piova = alloc_iova(&domain->iovad, - size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1); + size >> PAGE_SHIFT, IOVA_PFN(end), 1); return piova; } @@ -1807,12 +1816,12 @@ get_valid_domain_for_dev(struct pci_dev *pdev) return domain; } -static dma_addr_t +dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir) { struct pci_dev *pdev = to_pci_dev(hwdev); struct dmar_domain *domain; - unsigned long start_paddr; + phys_addr_t start_paddr; struct iova *iova; int prot = 0; int ret; @@ -1831,7 +1840,7 @@ intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir) if (!iova) goto error; - start_paddr = iova->pfn_lo << PAGE_SHIFT_4K; + start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT; /* * Check if DMAR supports zero-length reads on write only @@ -1849,27 +1858,23 @@ intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir) * is not a big problem */ ret = domain_page_mapping(domain, start_paddr, - ((u64)paddr) & PAGE_MASK_4K, size, prot); + ((u64)paddr) & PAGE_MASK, size, prot); if (ret) goto error; - pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n", - pci_name(pdev), size, (u64)paddr, - size, (u64)start_paddr, dir); - /* it's a non-present to present mapping */ ret = iommu_flush_iotlb_psi(domain->iommu, domain->id, - start_paddr, size >> PAGE_SHIFT_4K, 1); + start_paddr, size >> VTD_PAGE_SHIFT, 1); if (ret) iommu_flush_write_buffer(domain->iommu); - return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K))); + return start_paddr + ((u64)paddr & (~PAGE_MASK)); error: if (iova) __free_iova(&domain->iovad, iova); printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n", - pci_name(pdev), size, (u64)paddr, dir); + pci_name(pdev), size, (unsigned long long)paddr, dir); return 0; } @@ -1931,8 +1936,8 @@ static void add_unmap(struct dmar_domain *dom, struct iova *iova) spin_unlock_irqrestore(&async_umap_flush_lock, flags); } -static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, - size_t size, int dir) +void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, + int dir) { struct pci_dev *pdev = to_pci_dev(dev); struct dmar_domain *domain; @@ -1948,11 +1953,11 @@ static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, if (!iova) return; - start_addr = iova->pfn_lo << PAGE_SHIFT_4K; + start_addr = iova->pfn_lo << PAGE_SHIFT; size = aligned_size((u64)dev_addr, size); pr_debug("Device %s unmapping: %lx@%llx\n", - pci_name(pdev), size, (u64)start_addr); + pci_name(pdev), size, (unsigned long long)start_addr); /* clear the whole page */ dma_pte_clear_range(domain, start_addr, start_addr + size); @@ -1960,7 +1965,7 @@ static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, dma_pte_free_pagetable(domain, start_addr, start_addr + size); if (intel_iommu_strict) { if (iommu_flush_iotlb_psi(domain->iommu, - domain->id, start_addr, size >> PAGE_SHIFT_4K, 0)) + domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0)) iommu_flush_write_buffer(domain->iommu); /* free iova */ __free_iova(&domain->iovad, iova); @@ -1973,13 +1978,13 @@ static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, } } -static void * intel_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags) +void *intel_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) { void *vaddr; int order; - size = PAGE_ALIGN_4K(size); + size = PAGE_ALIGN(size); order = get_order(size); flags &= ~(GFP_DMA | GFP_DMA32); @@ -1995,12 +2000,12 @@ static void * intel_alloc_coherent(struct device *hwdev, size_t size, return NULL; } -static void intel_free_coherent(struct device *hwdev, size_t size, - void *vaddr, dma_addr_t dma_handle) +void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, + dma_addr_t dma_handle) { int order; - size = PAGE_ALIGN_4K(size); + size = PAGE_ALIGN(size); order = get_order(size); intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL); @@ -2008,8 +2013,9 @@ static void intel_free_coherent(struct device *hwdev, size_t size, } #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg))) -static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, - int nelems, int dir) + +void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, + int nelems, int dir) { int i; struct pci_dev *pdev = to_pci_dev(hwdev); @@ -2033,7 +2039,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, size += aligned_size((u64)addr, sg->length); } - start_addr = iova->pfn_lo << PAGE_SHIFT_4K; + start_addr = iova->pfn_lo << PAGE_SHIFT; /* clear the whole page */ dma_pte_clear_range(domain, start_addr, start_addr + size); @@ -2041,7 +2047,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, dma_pte_free_pagetable(domain, start_addr, start_addr + size); if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr, - size >> PAGE_SHIFT_4K, 0)) + size >> VTD_PAGE_SHIFT, 0)) iommu_flush_write_buffer(domain->iommu); /* free iova */ @@ -2062,8 +2068,8 @@ static int intel_nontranslate_map_sg(struct device *hddev, return nelems; } -static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, - int nelems, int dir) +int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, + int dir) { void *addr; int i; @@ -2107,14 +2113,14 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) prot |= DMA_PTE_WRITE; - start_addr = iova->pfn_lo << PAGE_SHIFT_4K; + start_addr = iova->pfn_lo << PAGE_SHIFT; offset = 0; for_each_sg(sglist, sg, nelems, i) { addr = SG_ENT_VIRT_ADDRESS(sg); addr = (void *)virt_to_phys(addr); size = aligned_size((u64)addr, sg->length); ret = domain_page_mapping(domain, start_addr + offset, - ((u64)addr) & PAGE_MASK_4K, + ((u64)addr) & PAGE_MASK, size, prot); if (ret) { /* clear the page */ @@ -2128,14 +2134,14 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, return 0; } sg->dma_address = start_addr + offset + - ((u64)addr & (~PAGE_MASK_4K)); + ((u64)addr & (~PAGE_MASK)); sg->dma_length = sg->length; offset += size; } /* it's a non-present to present mapping */ if (iommu_flush_iotlb_psi(domain->iommu, domain->id, - start_addr, offset >> PAGE_SHIFT_4K, 1)) + start_addr, offset >> VTD_PAGE_SHIFT, 1)) iommu_flush_write_buffer(domain->iommu); return nelems; } @@ -2175,7 +2181,6 @@ static inline int iommu_devinfo_cache_init(void) sizeof(struct device_domain_info), 0, SLAB_HWCACHE_ALIGN, - NULL); if (!iommu_devinfo_cache) { printk(KERN_ERR "Couldn't create devinfo cache\n"); @@ -2193,7 +2198,6 @@ static inline int iommu_iova_cache_init(void) sizeof(struct iova), 0, SLAB_HWCACHE_ALIGN, - NULL); if (!iommu_iova_cache) { printk(KERN_ERR "Couldn't create iova cache\n"); @@ -2322,7 +2326,7 @@ void intel_iommu_domain_exit(struct dmar_domain *domain) return; end = DOMAIN_MAX_ADDR(domain->gaw); - end = end & (~PAGE_MASK_4K); + end = end & (~VTD_PAGE_MASK); /* clear ptes */ dma_pte_clear_range(domain, 0, end); @@ -2418,6 +2422,6 @@ u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova) if (pte) pfn = dma_pte_addr(*pte); - return pfn >> PAGE_SHIFT_4K; + return pfn >> VTD_PAGE_SHIFT; } EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index e872ac925b4..832175d9ca2 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -35,6 +35,20 @@ static void __devinit quirk_mellanox_tavor(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR,quirk_mellanox_tavor); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE,quirk_mellanox_tavor); +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ +int forbid_dac __read_mostly; +EXPORT_SYMBOL(forbid_dac); + +static __devinit void via_no_dac(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + dev_info(&dev->dev, + "VIA PCI bridge detected. Disabling DAC.\n"); + forbid_dac = 1; + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); + /* Deal with broken BIOS'es that neglect to enable passive release, which can cause problems in combination with the 82441FX/PPro MTRRs */ static void quirk_passive_release(struct pci_dev *dev) diff --git a/include/asm-x86/iommu.h b/include/asm-x86/iommu.h index 961e746da97..2daaffcda52 100644 --- a/include/asm-x86/iommu.h +++ b/include/asm-x86/iommu.h @@ -7,9 +7,13 @@ extern struct dma_mapping_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; extern int dmar_disabled; +extern int forbid_dac; extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len); +/* 10 seconds */ +#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) + #ifdef CONFIG_GART_IOMMU extern int gart_iommu_aperture; extern int gart_iommu_aperture_allowed; diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index bff5c65f81d..952df39c989 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -2,15 +2,14 @@ #define _DMA_REMAPPING_H /* - * We need a fixed PAGE_SIZE of 4K irrespective of - * arch PAGE_SIZE for IOMMU page tables. + * VT-d hardware uses 4KiB page size regardless of host page size. */ -#define PAGE_SHIFT_4K (12) -#define PAGE_SIZE_4K (1UL << PAGE_SHIFT_4K) -#define PAGE_MASK_4K (((u64)-1) << PAGE_SHIFT_4K) -#define PAGE_ALIGN_4K(addr) (((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K) +#define VTD_PAGE_SHIFT (12) +#define VTD_PAGE_SIZE (1UL << VTD_PAGE_SHIFT) +#define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) +#define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) -#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT_4K) +#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK) #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK) @@ -25,7 +24,7 @@ struct root_entry { u64 val; u64 rsvd1; }; -#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) +#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) static inline bool root_present(struct root_entry *root) { return (root->val & 1); @@ -36,7 +35,7 @@ static inline void set_root_present(struct root_entry *root) } static inline void set_root_value(struct root_entry *root, unsigned long value) { - root->val |= value & PAGE_MASK_4K; + root->val |= value & VTD_PAGE_MASK; } struct context_entry; @@ -45,7 +44,7 @@ get_context_addr_from_root(struct root_entry *root) { return (struct context_entry *) (root_present(root)?phys_to_virt( - root->val & PAGE_MASK_4K): + root->val & VTD_PAGE_MASK) : NULL); } @@ -67,7 +66,7 @@ struct context_entry { #define context_present(c) ((c).lo & 1) #define context_fault_disable(c) (((c).lo >> 1) & 1) #define context_translation_type(c) (((c).lo >> 2) & 3) -#define context_address_root(c) ((c).lo & PAGE_MASK_4K) +#define context_address_root(c) ((c).lo & VTD_PAGE_MASK) #define context_address_width(c) ((c).hi & 7) #define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1)) @@ -81,7 +80,7 @@ struct context_entry { } while (0) #define CONTEXT_TT_MULTI_LEVEL 0 #define context_set_address_root(c, val) \ - do {(c).lo |= (val) & PAGE_MASK_4K;} while (0) + do {(c).lo |= (val) & VTD_PAGE_MASK; } while (0) #define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0) #define context_set_domain_id(c, val) \ do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0) @@ -107,9 +106,9 @@ struct dma_pte { #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0) #define dma_set_pte_prot(p, prot) \ do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0) -#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K) +#define dma_pte_addr(p) ((p).val & VTD_PAGE_MASK) #define dma_set_pte_addr(p, addr) do {\ - (p).val |= ((addr) & PAGE_MASK_4K); } while (0) + (p).val |= ((addr) & VTD_PAGE_MASK); } while (0) #define dma_pte_present(p) (((p).val & 3) != 0) struct intel_iommu; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index afb0d2a5b7c..3d017cfd245 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -29,6 +29,7 @@ #include #include #include +#include /* * Intel IOMMU register specification per version 1.0 public spec. @@ -202,22 +203,21 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define dma_frcd_type(d) ((d >> 30) & 1) #define dma_frcd_fault_reason(c) (c & 0xff) #define dma_frcd_source_id(c) (c & 0xffff) -#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */ - -#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */ - -#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ -{\ - cycles_t start_time = get_cycles();\ - while (1) {\ - sts = op (iommu->reg + offset);\ - if (cond)\ - break;\ +/* low 64 bit */ +#define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT)) + +#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ +do { \ + cycles_t start_time = get_cycles(); \ + while (1) { \ + sts = op(iommu->reg + offset); \ + if (cond) \ + break; \ if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\ - panic("DMAR hardware is malfunctioning\n");\ - cpu_relax();\ - }\ -} + panic("DMAR hardware is malfunctioning\n"); \ + cpu_relax(); \ + } \ +} while (0) #define QI_LENGTH 256 /* queue length */ @@ -244,7 +244,7 @@ enum { #define QI_IOTLB_DR(dr) (((u64)dr) << 7) #define QI_IOTLB_DW(dw) (((u64)dw) << 6) #define QI_IOTLB_GRAN(gran) (((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4)) -#define QI_IOTLB_ADDR(addr) (((u64)addr) & PAGE_MASK_4K) +#define QI_IOTLB_ADDR(addr) (((u64)addr) & VTD_PAGE_MASK) #define QI_IOTLB_IH(ih) (((u64)ih) << 6) #define QI_IOTLB_AM(am) (((u8)am)) @@ -353,4 +353,11 @@ static inline int intel_iommu_found(void) } #endif /* CONFIG_DMAR */ +extern void *intel_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t); +extern void intel_free_coherent(struct device *, size_t, void *, dma_addr_t); +extern dma_addr_t intel_map_single(struct device *, phys_addr_t, size_t, int); +extern void intel_unmap_single(struct device *, dma_addr_t, size_t, int); +extern int intel_map_sg(struct device *, struct scatterlist *, int, int); +extern void intel_unmap_sg(struct device *, struct scatterlist *, int, int); + #endif -- cgit v1.2.3-70-g09d2 From f609891f428e1c20e270e7c350daf8c93cc459d7 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 16 Oct 2008 16:27:36 +0200 Subject: amd_iommu: fix nasty bug that caused ILLEGAL_DEVICE_TABLE_ENTRY errors We are on 64-bit so better use u64 instead of u32 to deal with addresses: static void __init iommu_set_device_table(struct amd_iommu *iommu) { u64 entry; ... entry = virt_to_phys(amd_iommu_dev_table); ... (I am wondering why gcc 4.2.x did not warn about the assignment between u32 and unsigned long.) Cc: iommu@lists.linux-foundation.org Cc: stable@kernel.org Signed-off-by: Andreas Herrmann Signed-off-by: Joerg Roedel Signed-off-by: David Woodhouse --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 4cd8083c58b..0cdcda35a05 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -212,7 +212,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) /* Programs the physical address of the device table into the IOMMU hardware */ static void __init iommu_set_device_table(struct amd_iommu *iommu) { - u32 entry; + u64 entry; BUG_ON(iommu->mmio_base == NULL); -- cgit v1.2.3-70-g09d2 From db64fe02258f1507e13fe5212a989922323685ce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 18 Oct 2008 20:27:03 -0700 Subject: mm: rewrite vmap layer Rewrite the vmap allocator to use rbtrees and lazy tlb flushing, and provide a fast, scalable percpu frontend for small vmaps (requires a slightly different API, though). The biggest problem with vmap is actually vunmap. Presently this requires a global kernel TLB flush, which on most architectures is a broadcast IPI to all CPUs to flush the cache. This is all done under a global lock. As the number of CPUs increases, so will the number of vunmaps a scaled workload will want to perform, and so will the cost of a global TLB flush. This gives terrible quadratic scalability characteristics. Another problem is that the entire vmap subsystem works under a single lock. It is a rwlock, but it is actually taken for write in all the fast paths, and the read locking would likely never be run concurrently anyway, so it's just pointless. This is a rewrite of vmap subsystem to solve those problems. The existing vmalloc API is implemented on top of the rewritten subsystem. The TLB flushing problem is solved by using lazy TLB unmapping. vmap addresses do not have to be flushed immediately when they are vunmapped, because the kernel will not reuse them again (would be a use-after-free) until they are reallocated. So the addresses aren't allocated again until a subsequent TLB flush. A single TLB flush then can flush multiple vunmaps from each CPU. XEN and PAT and such do not like deferred TLB flushing because they can't always handle multiple aliasing virtual addresses to a physical address. They now call vm_unmap_aliases() in order to flush any deferred mappings. That call is very expensive (well, actually not a lot more expensive than a single vunmap under the old scheme), however it should be OK if not called too often. The virtual memory extent information is stored in an rbtree rather than a linked list to improve the algorithmic scalability. There is a per-CPU allocator for small vmaps, which amortizes or avoids global locking. To use the per-CPU interface, the vm_map_ram / vm_unmap_ram interfaces must be used in place of vmap and vunmap. Vmalloc does not use these interfaces at the moment, so it will not be quite so scalable (although it will use lazy TLB flushing). As a quick test of performance, I ran a test that loops in the kernel, linearly mapping then touching then unmapping 4 pages. Different numbers of tests were run in parallel on an 4 core, 2 socket opteron. Results are in nanoseconds per map+touch+unmap. threads vanilla vmap rewrite 1 14700 2900 2 33600 3000 4 49500 2800 8 70631 2900 So with a 8 cores, the rewritten version is already 25x faster. In a slightly more realistic test (although with an older and less scalable version of the patch), I ripped the not-very-good vunmap batching code out of XFS, and implemented the large buffer mapping with vm_map_ram and vm_unmap_ram... along with a couple of other tricks, I was able to speed up a large directory workload by 20x on a 64 CPU system. I believe vmap/vunmap is actually sped up a lot more than 20x on such a system, but I'm running into other locks now. vmap is pretty well blown off the profiles. Before: 1352059 total 0.1401 798784 _write_lock 8320.6667 <- vmlist_lock 529313 default_idle 1181.5022 15242 smp_call_function 15.8771 <- vmap tlb flushing 2472 __get_vm_area_node 1.9312 <- vmap 1762 remove_vm_area 4.5885 <- vunmap 316 map_vm_area 0.2297 <- vmap 312 kfree 0.1950 300 _spin_lock 3.1250 252 sn_send_IPI_phys 0.4375 <- tlb flushing 238 vmap 0.8264 <- vmap 216 find_lock_page 0.5192 196 find_next_bit 0.3603 136 sn2_send_IPI 0.2024 130 pio_phys_write_mmr 2.0312 118 unmap_kernel_range 0.1229 After: 78406 total 0.0081 40053 default_idle 89.4040 33576 ia64_spinlock_contention 349.7500 1650 _spin_lock 17.1875 319 __reg_op 0.5538 281 _atomic_dec_and_lock 1.0977 153 mutex_unlock 1.5938 123 iget_locked 0.1671 117 xfs_dir_lookup 0.1662 117 dput 0.1406 114 xfs_iget_core 0.0268 92 xfs_da_hashname 0.1917 75 d_alloc 0.0670 68 vmap_page_range 0.0462 <- vmap 58 kmem_cache_alloc 0.0604 57 memset 0.0540 52 rb_next 0.1625 50 __copy_user 0.0208 49 bitmap_find_free_region 0.2188 <- vmap 46 ia64_sn_udelay 0.1106 45 find_inode_fast 0.1406 42 memcmp 0.2188 42 finish_task_switch 0.1094 42 __d_lookup 0.0410 40 radix_tree_lookup_slot 0.1250 37 _spin_unlock_irqrestore 0.3854 36 xfs_bmapi 0.0050 36 kmem_cache_free 0.0256 35 xfs_vn_getattr 0.0322 34 radix_tree_lookup 0.1062 33 __link_path_walk 0.0035 31 xfs_da_do_buf 0.0091 30 _xfs_buf_find 0.0204 28 find_get_page 0.0875 27 xfs_iread 0.0241 27 __strncpy_from_user 0.2812 26 _xfs_buf_initialize 0.0406 24 _xfs_buf_lookup_pages 0.0179 24 vunmap_page_range 0.0250 <- vunmap 23 find_lock_page 0.0799 22 vm_map_ram 0.0087 <- vmap 20 kfree 0.0125 19 put_page 0.0330 18 __kmalloc 0.0176 17 xfs_da_node_lookup_int 0.0086 17 _read_lock 0.0885 17 page_waitqueue 0.0664 vmap has gone from being the top 5 on the profiles and flushing the crap out of all TLBs, to using less than 1% of kernel time. [akpm@linux-foundation.org: cleanups, section fix] [akpm@linux-foundation.org: fix build on alpha] Signed-off-by: Nick Piggin Cc: Jeremy Fitzhardinge Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pageattr.c | 2 + arch/x86/xen/enlighten.c | 1 + arch/x86/xen/mmu.c | 1 + include/linux/vmalloc.h | 15 +- init/main.c | 2 + mm/vmalloc.c | 975 ++++++++++++++++++++++++++++++++++++++++------- 6 files changed, 862 insertions(+), 134 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a9ec89c3fbc..407d8784f66 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -792,6 +792,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); + vm_unmap_aliases(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0013a729b41..b61534c7a4c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -871,6 +871,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l /* make sure there are no stray mappings of this page */ kmap_flush_unused(); + vm_unmap_aliases(); } } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ae173f6edd8..d4d52f5a1cf 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -846,6 +846,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); + vm_unmap_aliases(); xen_mc_batch(); } diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 328eb402272..4c28c4d564e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -2,6 +2,7 @@ #define _LINUX_VMALLOC_H #include +#include #include /* pgprot_t */ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -23,7 +24,6 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ #endif struct vm_struct { - /* keep next,addr,size together to speedup lookups */ struct vm_struct *next; void *addr; unsigned long size; @@ -37,6 +37,19 @@ struct vm_struct { /* * Highlevel APIs for driver use */ +extern void vm_unmap_ram(const void *mem, unsigned int count); +extern void *vm_map_ram(struct page **pages, unsigned int count, + int node, pgprot_t prot); +extern void vm_unmap_aliases(void); + +#ifdef CONFIG_MMU +extern void __init vmalloc_init(void); +#else +static inline void vmalloc_init(void) +{ +} +#endif + extern void *vmalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); diff --git a/init/main.c b/init/main.c index 27f6bf6108e..4371d11721f 100644 --- a/init/main.c +++ b/init/main.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -642,6 +643,7 @@ asmlinkage void __init start_kernel(void) initrd_start = 0; } #endif + vmalloc_init(); vfs_caches_init_early(); cpuset_init_early(); mem_init(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bba06c41fc5..712ae47af0b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -8,6 +8,7 @@ * Numa awareness, Christoph Lameter, SGI, June 2005 */ +#include #include #include #include @@ -18,16 +19,17 @@ #include #include #include +#include +#include +#include +#include +#include #include #include -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller); +/*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) { @@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) } while (pte++, addr += PAGE_SIZE, addr != end); } -static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; @@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, } while (pmd++, addr = next, addr != end); } -static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end) +static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; @@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } -void unmap_kernel_range(unsigned long addr, unsigned long size) +static void vunmap_page_range(unsigned long addr, unsigned long end) { pgd_t *pgd; unsigned long next; - unsigned long start = addr; - unsigned long end = addr + size; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); @@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) continue; vunmap_pud_range(pgd, addr, next); } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); -} - -static void unmap_vm_area(struct vm_struct *area) -{ - unmap_kernel_range((unsigned long)area->addr, area->size); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pte_t *pte; + /* + * nr is a running index into the array which helps higher level + * callers keep track of where we're up to. + */ + pte = pte_alloc_kernel(pmd, addr); if (!pte) return -ENOMEM; do { - struct page *page = **pages; - WARN_ON(!pte_none(*pte)); - if (!page) + struct page *page = pages[*nr]; + + if (WARN_ON(!pte_none(*pte))) + return -EBUSY; + if (WARN_ON(!page)) return -ENOMEM; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); - (*pages)++; + (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); return 0; } -static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) +static int vmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pmd_t *pmd; unsigned long next; @@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } -static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) +static int vmap_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pud_t *pud; unsigned long next; @@ -141,44 +140,49 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +/* + * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and + * will have pfns corresponding to the "pages" array. + * + * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + */ +static int vmap_page_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; - unsigned long addr = (unsigned long) area->addr; - unsigned long end = addr + area->size - PAGE_SIZE; - int err; + int err = 0; + int nr = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_pud_range(pgd, addr, next, prot, pages); + err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap((unsigned long) area->addr, end); - return err; + flush_cache_vmap(addr, end); + + if (unlikely(err)) + return err; + return nr; } -EXPORT_SYMBOL_GPL(map_vm_area); /* - * Map a vmalloc()-space virtual address to the physical page. + * Walk a vmap address to the struct page it maps. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for @@ -188,10 +192,12 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) !is_module_address(addr)); if (!pgd_none(*pgd)) { - pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(pgd, addr); if (!pud_none(*pud)) { - pmd = pmd_offset(pud, addr); + pmd_t *pmd = pmd_offset(pud, addr); if (!pmd_none(*pmd)) { + pte_t *ptep, pte; + ptep = pte_offset_map(pmd, addr); pte = *ptep; if (pte_present(pte)) @@ -213,13 +219,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_pfn); -static struct vm_struct * -__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, - unsigned long end, int node, gfp_t gfp_mask, void *caller) + +/*** Global kva allocator ***/ + +#define VM_LAZY_FREE 0x01 +#define VM_LAZY_FREEING 0x02 +#define VM_VM_AREA 0x04 + +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + void *private; + struct rcu_head rcu_head; +}; + +static DEFINE_SPINLOCK(vmap_area_lock); +static struct rb_root vmap_area_root = RB_ROOT; +static LIST_HEAD(vmap_area_list); + +static struct vmap_area *__find_vmap_area(unsigned long addr) { - struct vm_struct **p, *tmp, *area; - unsigned long align = 1; + struct rb_node *n = vmap_area_root.rb_node; + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr > va->va_start) + n = n->rb_right; + else + return va; + } + + return NULL; +} + +static void __insert_vmap_area(struct vmap_area *va) +{ + struct rb_node **p = &vmap_area_root.rb_node; + struct rb_node *parent = NULL; + struct rb_node *tmp; + + while (*p) { + struct vmap_area *tmp; + + parent = *p; + tmp = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp->va_end) + p = &(*p)->rb_left; + else if (va->va_end > tmp->va_start) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&va->rb_node, parent, p); + rb_insert_color(&va->rb_node, &vmap_area_root); + + /* address-sort this list so it is usable like the vmlist */ + tmp = rb_prev(&va->rb_node); + if (tmp) { + struct vmap_area *prev; + prev = rb_entry(tmp, struct vmap_area, rb_node); + list_add_rcu(&va->list, &prev->list); + } else + list_add_rcu(&va->list, &vmap_area_list); +} + +static void purge_vmap_area_lazy(void); + +/* + * Allocate a region of KVA of the specified size and alignment, within the + * vstart and vend. + */ +static struct vmap_area *alloc_vmap_area(unsigned long size, + unsigned long align, + unsigned long vstart, unsigned long vend, + int node, gfp_t gfp_mask) +{ + struct vmap_area *va; + struct rb_node *n; + unsigned long addr; + int purged = 0; + + BUG_ON(size & ~PAGE_MASK); + + addr = ALIGN(vstart, align); + + va = kmalloc_node(sizeof(struct vmap_area), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + +retry: + spin_lock(&vmap_area_lock); + /* XXX: could have a last_hole cache */ + n = vmap_area_root.rb_node; + if (n) { + struct vmap_area *first = NULL; + + do { + struct vmap_area *tmp; + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_end >= addr) { + if (!first && tmp->va_start < addr + size) + first = tmp; + n = n->rb_left; + } else { + first = tmp; + n = n->rb_right; + } + } while (n); + + if (!first) + goto found; + + if (first->va_end < addr) { + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + + while (addr + size >= first->va_start && addr + size <= vend) { + addr = ALIGN(first->va_end + PAGE_SIZE, align); + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + } +found: + if (addr + size > vend) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = 1; + goto retry; + } + if (printk_ratelimit()) + printk(KERN_WARNING "vmap allocation failed: " + "use vmalloc= to increase size.\n"); + return ERR_PTR(-EBUSY); + } + + BUG_ON(addr & (align-1)); + + va->va_start = addr; + va->va_end = addr + size; + va->flags = 0; + __insert_vmap_area(va); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void rcu_free_va(struct rcu_head *head) +{ + struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); + + kfree(va); +} + +static void __free_vmap_area(struct vmap_area *va) +{ + BUG_ON(RB_EMPTY_NODE(&va->rb_node)); + rb_erase(&va->rb_node, &vmap_area_root); + RB_CLEAR_NODE(&va->rb_node); + list_del_rcu(&va->list); + + call_rcu(&va->rcu_head, rcu_free_va); +} + +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) +{ + spin_lock(&vmap_area_lock); + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); +} + +/* + * Clear the pagetable entries of a given vmap_area + */ +static void unmap_vmap_area(struct vmap_area *va) +{ + vunmap_page_range(va->va_start, va->va_end); +} + +/* + * lazy_max_pages is the maximum amount of virtual address space we gather up + * before attempting to purge with a TLB flush. + * + * There is a tradeoff here: a larger number will cover more kernel page tables + * and take slightly longer to purge, but it will linearly reduce the number of + * global TLB flushes that must be performed. It would seem natural to scale + * this number up linearly with the number of CPUs (because vmapping activity + * could also scale linearly with the number of CPUs), however it is likely + * that in practice, workloads might be constrained in other ways that mean + * vmap activity will not scale linearly with CPUs. Also, I want to be + * conservative and not introduce a big latency on huge systems, so go with + * a less aggressive log scale. It will still be an improvement over the old + * code, and it will be simple to change the scale factor if we find that it + * becomes a problem on bigger systems. + */ +static unsigned long lazy_max_pages(void) +{ + unsigned int log; + + log = fls(num_online_cpus()); + + return log * (32UL * 1024 * 1024 / PAGE_SIZE); +} + +static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); + +/* + * Purges all lazily-freed vmap areas. + * + * If sync is 0 then don't purge if there is already a purge in progress. + * If force_flush is 1, then flush kernel TLBs between *start and *end even + * if we found no lazy vmap areas to unmap (callers can use this to optimise + * their own TLB flushing). + * Returns with *start = min(*start, lowest purged address) + * *end = max(*end, highest purged address) + */ +static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, + int sync, int force_flush) +{ + static DEFINE_SPINLOCK(purge_lock); + LIST_HEAD(valist); + struct vmap_area *va; + int nr = 0; + + /* + * If sync is 0 but force_flush is 1, we'll go sync anyway but callers + * should not expect such behaviour. This just simplifies locking for + * the case that isn't actually used at the moment anyway. + */ + if (!sync && !force_flush) { + if (!spin_trylock(&purge_lock)) + return; + } else + spin_lock(&purge_lock); + + rcu_read_lock(); + list_for_each_entry_rcu(va, &vmap_area_list, list) { + if (va->flags & VM_LAZY_FREE) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + unmap_vmap_area(va); + list_add_tail(&va->purge_list, &valist); + va->flags |= VM_LAZY_FREEING; + va->flags &= ~VM_LAZY_FREE; + } + } + rcu_read_unlock(); + + if (nr) { + BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + atomic_sub(nr, &vmap_lazy_nr); + } + + if (nr || force_flush) + flush_tlb_kernel_range(*start, *end); + + if (nr) { + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &valist, purge_list) + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); + } + spin_unlock(&purge_lock); +} + +/* + * Kick off a purge of the outstanding lazy areas. + */ +static void purge_vmap_area_lazy(void) +{ + unsigned long start = ULONG_MAX, end = 0; + + __purge_vmap_area_lazy(&start, &end, 0, 0); +} + +/* + * Free and unmap a vmap area + */ +static void free_unmap_vmap_area(struct vmap_area *va) +{ + va->flags |= VM_LAZY_FREE; + atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); + if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + purge_vmap_area_lazy(); +} + +static struct vmap_area *find_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void free_unmap_vmap_area_addr(unsigned long addr) +{ + struct vmap_area *va; + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); +} + + +/*** Per cpu kva allocator ***/ + +/* + * vmap space is limited especially on 32 bit architectures. Ensure there is + * room for at least 16 percpu vmap blocks per CPU. + */ +/* + * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able + * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess + * instead (we just need a rough idea) + */ +#if BITS_PER_LONG == 32 +#define VMALLOC_SPACE (128UL*1024*1024) +#else +#define VMALLOC_SPACE (128UL*1024*1024*1024) +#endif + +#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) +#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ +#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ +#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) +#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ +#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ +#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / NR_CPUS / 16)) + +#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) + +struct vmap_block_queue { + spinlock_t lock; + struct list_head free; + struct list_head dirty; + unsigned int nr_dirty; +}; + +struct vmap_block { + spinlock_t lock; + struct vmap_area *va; + struct vmap_block_queue *vbq; + unsigned long free, dirty; + DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); + DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); + union { + struct { + struct list_head free_list; + struct list_head dirty_list; + }; + struct rcu_head rcu_head; + }; +}; + +/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ +static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); + +/* + * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block + * in the free path. Could get rid of this if we change the API to return a + * "cookie" from alloc, to be passed to free. But no big deal yet. + */ +static DEFINE_SPINLOCK(vmap_block_tree_lock); +static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); + +/* + * We should probably have a fallback mechanism to allocate virtual memory + * out of partially filled vmap blocks. However vmap block sizing should be + * fairly reasonable according to the vmalloc size, so it shouldn't be a + * big problem. + */ + +static unsigned long addr_to_vb_idx(unsigned long addr) +{ + addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); + addr /= VMAP_BLOCK_SIZE; + return addr; +} + +static struct vmap_block *new_vmap_block(gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + struct vmap_area *va; + unsigned long vb_idx; + int node, err; + + node = numa_node_id(); + + vb = kmalloc_node(sizeof(struct vmap_block), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!vb)) + return ERR_PTR(-ENOMEM); + + va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, + VMALLOC_START, VMALLOC_END, + node, gfp_mask); + if (unlikely(IS_ERR(va))) { + kfree(vb); + return ERR_PTR(PTR_ERR(va)); + } + + err = radix_tree_preload(gfp_mask); + if (unlikely(err)) { + kfree(vb); + free_vmap_area(va); + return ERR_PTR(err); + } + + spin_lock_init(&vb->lock); + vb->va = va; + vb->free = VMAP_BBMAP_BITS; + vb->dirty = 0; + bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); + INIT_LIST_HEAD(&vb->free_list); + INIT_LIST_HEAD(&vb->dirty_list); + + vb_idx = addr_to_vb_idx(va->va_start); + spin_lock(&vmap_block_tree_lock); + err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(err); + radix_tree_preload_end(); + + vbq = &get_cpu_var(vmap_block_queue); + vb->vbq = vbq; + spin_lock(&vbq->lock); + list_add(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); + put_cpu_var(vmap_cpu_blocks); + + return vb; +} + +static void rcu_free_vb(struct rcu_head *head) +{ + struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); + + kfree(vb); +} + +static void free_vmap_block(struct vmap_block *vb) +{ + struct vmap_block *tmp; + unsigned long vb_idx; + + spin_lock(&vb->vbq->lock); + if (!list_empty(&vb->free_list)) + list_del(&vb->free_list); + if (!list_empty(&vb->dirty_list)) + list_del(&vb->dirty_list); + spin_unlock(&vb->vbq->lock); + + vb_idx = addr_to_vb_idx(vb->va->va_start); + spin_lock(&vmap_block_tree_lock); + tmp = radix_tree_delete(&vmap_block_tree, vb_idx); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(tmp != vb); + + free_unmap_vmap_area(vb->va); + call_rcu(&vb->rcu_head, rcu_free_vb); +} + +static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + unsigned long addr = 0; + unsigned int order; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + order = get_order(size); + +again: + rcu_read_lock(); + vbq = &get_cpu_var(vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = bitmap_find_free_region(vb->alloc_map, + VMAP_BBMAP_BITS, order); + + if (i >= 0) { + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_init(&vb->free_list); + spin_unlock(&vbq->lock); + } + spin_unlock(&vb->lock); + break; + } + spin_unlock(&vb->lock); + } + put_cpu_var(vmap_cpu_blocks); + rcu_read_unlock(); + + if (!addr) { + vb = new_vmap_block(gfp_mask); + if (IS_ERR(vb)) + return vb; + goto again; + } + + return (void *)addr; +} + +static void vb_free(const void *addr, unsigned long size) +{ + unsigned long offset; + unsigned long vb_idx; + unsigned int order; + struct vmap_block *vb; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + order = get_order(size); + + offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); + + vb_idx = addr_to_vb_idx((unsigned long)addr); + rcu_read_lock(); + vb = radix_tree_lookup(&vmap_block_tree, vb_idx); + rcu_read_unlock(); + BUG_ON(!vb); + + spin_lock(&vb->lock); + bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + if (!vb->dirty) { + spin_lock(&vb->vbq->lock); + list_add(&vb->dirty_list, &vb->vbq->dirty); + spin_unlock(&vb->vbq->lock); + } + vb->dirty += 1UL << order; + if (vb->dirty == VMAP_BBMAP_BITS) { + BUG_ON(vb->free || !list_empty(&vb->free_list)); + spin_unlock(&vb->lock); + free_vmap_block(vb); + } else + spin_unlock(&vb->lock); +} + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. + */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int cpu; + int flush = 0; + + for_each_possible_cpu(cpu) { + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + struct vmap_block *vb; + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); + while (i < VMAP_BBMAP_BITS) { + unsigned long s, e; + int j; + j = find_next_zero_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + + s = vb->va->va_start + (i << PAGE_SHIFT); + e = vb->va->va_start + (j << PAGE_SHIFT); + vunmap_page_range(s, e); + flush = 1; + + if (s < start) + start = s; + if (e > end) + end = e; + + i = j; + i = find_next_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + } + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + } + + __purge_vmap_area_lazy(&start, &end, 1, flush); +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +/** + * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram + * @mem: the pointer returned by vm_map_ram + * @count: the count passed to that vm_map_ram call (cannot unmap partial) + */ +void vm_unmap_ram(const void *mem, unsigned int count) +{ + unsigned long size = count << PAGE_SHIFT; + unsigned long addr = (unsigned long)mem; + + BUG_ON(!addr); + BUG_ON(addr < VMALLOC_START); + BUG_ON(addr > VMALLOC_END); + BUG_ON(addr & (PAGE_SIZE-1)); + + debug_check_no_locks_freed(mem, size); + + if (likely(count <= VMAP_MAX_ALLOC)) + vb_free(mem, size); + else + free_unmap_vmap_area_addr(addr); +} +EXPORT_SYMBOL(vm_unmap_ram); + +/** + * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) + * @pages: an array of pointers to the pages to be mapped + * @count: number of pages + * @node: prefer to allocate data structures on this node + * @prot: memory protection to use. PAGE_KERNEL for regular RAM + * @returns: a pointer to the address that has been mapped, or NULL on failure + */ +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + unsigned long size = count << PAGE_SHIFT; unsigned long addr; + void *mem; + + if (likely(count <= VMAP_MAX_ALLOC)) { + mem = vb_alloc(size, GFP_KERNEL); + if (IS_ERR(mem)) + return NULL; + addr = (unsigned long)mem; + } else { + struct vmap_area *va; + va = alloc_vmap_area(size, PAGE_SIZE, + VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + if (IS_ERR(va)) + return NULL; + + addr = va->va_start; + mem = (void *)addr; + } + if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + vm_unmap_ram(mem, count); + return NULL; + } + return mem; +} +EXPORT_SYMBOL(vm_map_ram); + +void __init vmalloc_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + INIT_LIST_HEAD(&vbq->dirty); + vbq->nr_dirty = 0; + } +} + +void unmap_kernel_range(unsigned long addr, unsigned long size) +{ + unsigned long end = addr + size; + vunmap_page_range(addr, end); + flush_tlb_kernel_range(addr, end); +} + +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long end = addr + area->size - PAGE_SIZE; + int err; + + err = vmap_page_range(addr, end, prot, *pages); + if (err > 0) { + *pages += err; + err = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(map_vm_area); + +/*** Old vmalloc interfaces ***/ +DEFINE_RWLOCK(vmlist_lock); +struct vm_struct *vmlist; + +static struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long flags, unsigned long start, unsigned long end, + int node, gfp_t gfp_mask, void *caller) +{ + static struct vmap_area *va; + struct vm_struct *area; + struct vm_struct *tmp, **p; + unsigned long align = 1; BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { @@ -232,13 +976,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, align = 1ul << bit; } - addr = ALIGN(start, align); + size = PAGE_ALIGN(size); if (unlikely(!size)) return NULL; area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); - if (unlikely(!area)) return NULL; @@ -247,48 +990,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, */ size += PAGE_SIZE; - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { - if ((unsigned long)tmp->addr < addr) { - if((unsigned long)tmp->addr + tmp->size >= addr) - addr = ALIGN(tmp->size + - (unsigned long)tmp->addr, align); - continue; - } - if ((size + addr) < addr) - goto out; - if (size + addr <= (unsigned long)tmp->addr) - goto found; - addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); - if (addr > end - size) - goto out; + va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + if (IS_ERR(va)) { + kfree(area); + return NULL; } - if ((size + addr) < addr) - goto out; - if (addr > end - size) - goto out; - -found: - area->next = *p; - *p = area; area->flags = flags; - area->addr = (void *)addr; + area->addr = (void *)va->va_start; area->size = size; area->pages = NULL; area->nr_pages = 0; area->phys_addr = 0; area->caller = caller; + va->private = area; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= area->addr) + break; + } + area->next = *p; + *p = area; write_unlock(&vmlist_lock); return area; - -out: - write_unlock(&vmlist_lock); - kfree(area); - if (printk_ratelimit()) - printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc= to increase size.\n"); - return NULL; } struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, @@ -328,39 +1055,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, gfp_mask, __builtin_return_address(0)); } -/* Caller must hold vmlist_lock */ -static struct vm_struct *__find_vm_area(const void *addr) +static struct vm_struct *find_vm_area(const void *addr) { - struct vm_struct *tmp; + struct vmap_area *va; - for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { - if (tmp->addr == addr) - break; - } - - return tmp; -} - -/* Caller must hold vmlist_lock */ -static struct vm_struct *__remove_vm_area(const void *addr) -{ - struct vm_struct **p, *tmp; + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) + return va->private; - for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { - if (tmp->addr == addr) - goto found; - } return NULL; - -found: - unmap_vm_area(tmp); - *p = tmp->next; - - /* - * Remove the guard page. - */ - tmp->size -= PAGE_SIZE; - return tmp; } /** @@ -373,11 +1076,24 @@ found: */ struct vm_struct *remove_vm_area(const void *addr) { - struct vm_struct *v; - write_lock(&vmlist_lock); - v = __remove_vm_area(addr); - write_unlock(&vmlist_lock); - return v; + struct vmap_area *va; + + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) { + struct vm_struct *vm = va->private; + struct vm_struct *tmp, **p; + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) + ; + *p = tmp->next; + write_unlock(&vmlist_lock); + + return vm; + } + return NULL; } static void __vunmap(const void *addr, int deallocate_pages) @@ -487,6 +1203,8 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); +static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, + int node, void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) { @@ -613,10 +1331,8 @@ void *vmalloc_user(unsigned long size) ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); if (ret) { - write_lock(&vmlist_lock); - area = __find_vm_area(ret); + area = find_vm_area(ret); area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); } return ret; } @@ -696,10 +1412,8 @@ void *vmalloc_32_user(unsigned long size) ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); if (ret) { - write_lock(&vmlist_lock); - area = __find_vm_area(ret); + area = find_vm_area(ret); area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); } return ret; } @@ -800,26 +1514,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, struct vm_struct *area; unsigned long uaddr = vma->vm_start; unsigned long usize = vma->vm_end - vma->vm_start; - int ret; if ((PAGE_SIZE-1) & (unsigned long)addr) return -EINVAL; - read_lock(&vmlist_lock); - area = __find_vm_area(addr); + area = find_vm_area(addr); if (!area) - goto out_einval_locked; + return -EINVAL; if (!(area->flags & VM_USERMAP)) - goto out_einval_locked; + return -EINVAL; if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) - goto out_einval_locked; - read_unlock(&vmlist_lock); + return -EINVAL; addr += pgoff << PAGE_SHIFT; do { struct page *page = vmalloc_to_page(addr); + int ret; + ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; @@ -832,11 +1545,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, /* Prevent "things" like memory migration? VM_flags need a cleanup... */ vma->vm_flags |= VM_RESERVED; - return ret; - -out_einval_locked: - read_unlock(&vmlist_lock); - return -EINVAL; + return 0; } EXPORT_SYMBOL(remap_vmalloc_range); -- cgit v1.2.3-70-g09d2 From dc52ddc0e6f45b04780b26fc0813509f8e798c42 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sat, 18 Oct 2008 20:27:21 -0700 Subject: container freezer: implement freezer cgroup subsystem This patch implements a new freezer subsystem in the control groups framework. It provides a way to stop and resume execution of all tasks in a cgroup by writing in the cgroup filesystem. The freezer subsystem in the container filesystem defines a file named freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the cgroup. Subsequently writing "RUNNING" will unfreeze the tasks in the cgroup. Reading will return the current state. * Examples of usage : # mkdir /containers/freezer # mount -t cgroup -ofreezer freezer /containers # mkdir /containers/0 # echo $some_pid > /containers/0/tasks to get status of the freezer subsystem : # cat /containers/0/freezer.state RUNNING to freeze all tasks in the container : # echo FROZEN > /containers/0/freezer.state # cat /containers/0/freezer.state FREEZING # cat /containers/0/freezer.state FROZEN to unfreeze all tasks in the container : # echo RUNNING > /containers/0/freezer.state # cat /containers/0/freezer.state RUNNING This is the basic mechanism which should do the right thing for user space task in a simple scenario. It's important to note that freezing can be incomplete. In that case we return EBUSY. This means that some tasks in the cgroup are busy doing something that prevents us from completely freezing the cgroup at this time. After EBUSY, the cgroup will remain partially frozen -- reflected by freezer.state reporting "FREEZING" when read. The state will remain "FREEZING" until one of these things happens: 1) Userspace cancels the freezing operation by writing "RUNNING" to the freezer.state file 2) Userspace retries the freezing operation by writing "FROZEN" to the freezer.state file (writing "FREEZING" is not legal and returns EIO) 3) The tasks that blocked the cgroup from entering the "FROZEN" state disappear from the cgroup's set of tasks. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: export thaw_process] Signed-off-by: Cedric Le Goater Signed-off-by: Matt Helsley Acked-by: Serge E. Hallyn Tested-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 1 + arch/arm/Kconfig | 2 + arch/avr32/Kconfig | 2 + arch/blackfin/Kconfig | 3 + arch/cris/Kconfig | 2 + arch/frv/Kconfig | 2 + arch/h8300/Kconfig | 2 + arch/ia64/Kconfig | 2 + arch/m32r/Kconfig | 2 + arch/m68k/Kconfig | 2 + arch/m68knommu/Kconfig | 2 + arch/mips/Kconfig | 2 + arch/mn10300/Kconfig | 2 + arch/parisc/Kconfig | 2 + arch/powerpc/Kconfig | 2 + arch/s390/Kconfig | 2 + arch/sh/Kconfig | 2 + arch/sparc/Kconfig | 2 + arch/sparc64/Kconfig | 1 + arch/um/Kconfig | 2 + arch/x86/Kconfig | 1 + arch/xtensa/Kconfig | 1 + include/linux/cgroup_subsys.h | 6 + include/linux/freezer.h | 29 ++-- init/Kconfig | 7 + kernel/Kconfig.freezer | 2 + kernel/Makefile | 1 + kernel/cgroup_freezer.c | 366 ++++++++++++++++++++++++++++++++++++++++++ kernel/freezer.c | 32 ++++ kernel/power/Kconfig | 3 - 30 files changed, 465 insertions(+), 22 deletions(-) create mode 100644 kernel/Kconfig.freezer create mode 100644 kernel/cgroup_freezer.c (limited to 'arch/x86') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index a0f642b6a4b..6110197757a 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,7 @@ config AUTO_IRQ_AFFINITY default y source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "System setup" diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4853f9df37b..df39d20f742 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -192,6 +192,8 @@ config VECTORS_BASE source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "System Type" choice diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 7c239a91627..33a5b2969eb 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -72,6 +72,8 @@ config GENERIC_BUG source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "System Type and features" source "kernel/time/Kconfig" diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 8102c79aaa9..29e71ed6b8a 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -64,8 +64,11 @@ config HARDWARE_PM depends on OPROFILE source "init/Kconfig" + source "kernel/Kconfig.preempt" +source "kernel/Kconfig.freezer" + menu "Blackfin Processor Options" comment "Processor and Board Settings" diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 9389d38f222..07335e719bf 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -62,6 +62,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "General setup" source "fs/Kconfig.binfmt" diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index a5aac1b0756..9d1552a9ee2 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Fujitsu FR-V system setup" diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index c7966746fbf..bd1995403c6 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -90,6 +90,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + source "arch/h8300/Kconfig.cpu" menu "Executable file formats" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 3b7aa38254a..912c57db2d2 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" config IA64 diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 00289c178f8..dbaed4a6381 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -42,6 +42,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 677c93a490f..836fb66f080 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -62,6 +62,8 @@ mainmenu "Linux/68k Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Platform dependent setup" config EISA diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index 0a8998315e5..76b66feb74d 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -75,6 +75,8 @@ config NO_IOPORT source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" choice diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index b905744d791..5f149b030c0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER add initrd or initramfs image to the kernel image. Otherwise, say N. +source "kernel/Kconfig.freezer" + menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" config HW_HAS_EISA diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index dd557c9cf00..9a9f4335887 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -68,6 +68,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Matsushita MN10300 system setup" diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 8313fccced5..2bd1f6ef5db 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -90,6 +90,8 @@ config ARCH_MAY_HAVE_PC_FDC source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 380baa1780e..9391199d9e7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -230,6 +230,8 @@ config PPC_OF_PLATFORM_PCI source "init/Kconfig" +source "kernel/Kconfig.freezer" + source "arch/powerpc/sysdev/Kconfig" source "arch/powerpc/platforms/Kconfig" diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bc581d8a7cd..70b7645ce74 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -78,6 +78,8 @@ config S390 source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Base setup" comment "Processor type and features" diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 5131d50f851..2ed5713b754 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -106,6 +106,8 @@ config IO_TRAPPED source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "System type" # diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 97671dac12a..e594559c8db 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -37,6 +37,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "General machine setup" config SMP diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 5446e2a499b..035b15af90d 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -96,6 +96,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ def_bool y source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 6976812cfb1..393bccfe178 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -229,6 +229,8 @@ endmenu source "init/Kconfig" +source "kernel/Kconfig.freezer" + source "drivers/block/Kconfig" source "arch/um/Kconfig.char" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bd3c2c53873..49349ba77d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -193,6 +193,7 @@ config X86_TRAMPOLINE config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 02e417d3d8e..a213260b51e 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -55,6 +55,7 @@ config HZ default 100 source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e2877454ec8..9c22396e8b5 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -48,3 +48,9 @@ SUBSYS(devices) #endif /* */ + +#ifdef CONFIG_CGROUP_FREEZER +SUBSYS(freezer) +#endif + +/* */ diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 17e3bb42dd3..8f225339eee 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -46,26 +46,11 @@ static inline bool should_send_signal(struct task_struct *p) /* * Wake up a frozen process - * - * task_lock() is taken to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails. Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. */ -static inline int thaw_process(struct task_struct *p) -{ - task_lock(p); - if (frozen(p)) { - p->flags &= ~PF_FROZEN; - task_unlock(p); - wake_up_process(p); - return 1; - } - clear_freeze_flag(p); - task_unlock(p); - return 0; -} +extern int __thaw_process(struct task_struct *p); + +/* Takes and releases task alloc lock using task_lock() */ +extern int thaw_process(struct task_struct *p); extern void refrigerator(void); extern int freeze_processes(void); @@ -83,6 +68,12 @@ static inline int try_to_freeze(void) extern bool freeze_task(struct task_struct *p, bool sig_only); extern void cancel_freezing(struct task_struct *p); +#ifdef CONFIG_CGROUP_FREEZER +extern int cgroup_frozen(struct task_struct *task); +#else /* !CONFIG_CGROUP_FREEZER */ +static inline int cgroup_frozen(struct task_struct *task) { return 0; } +#endif /* !CONFIG_CGROUP_FREEZER */ + /* * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it * calls wait_for_completion(&vfork) and reset right after it returns from this diff --git a/init/Kconfig b/init/Kconfig index 5ceff3249a2..8828ed0b205 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -299,6 +299,13 @@ config CGROUP_NS for instance virtual servers and checkpoint/restart jobs. +config CGROUP_FREEZER + bool "control group freezer subsystem" + depends on CGROUPS + help + Provides a way to freeze and unfreeze all tasks in a + cgroup. + config CGROUP_DEVICE bool "Device controller for cgroups" depends on CGROUPS && EXPERIMENTAL diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer new file mode 100644 index 00000000000..a3bb4cb5253 --- /dev/null +++ b/kernel/Kconfig.freezer @@ -0,0 +1,2 @@ +config FREEZER + def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Makefile b/kernel/Makefile index e8194d15d5f..066550aa61c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c new file mode 100644 index 00000000000..b08722de610 --- /dev/null +++ b/kernel/cgroup_freezer.c @@ -0,0 +1,366 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include +#include + +enum freezer_state { + STATE_RUNNING = 0, + STATE_FREEZING, + STATE_FROZEN, +}; + +struct freezer { + struct cgroup_subsys_state css; + enum freezer_state state; + spinlock_t lock; /* protects _writes_ to state */ +}; + +static inline struct freezer *cgroup_freezer( + struct cgroup *cgroup) +{ + return container_of( + cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return container_of(task_subsys_state(task, freezer_subsys_id), + struct freezer, css); +} + +int cgroup_frozen(struct task_struct *task) +{ + struct freezer *freezer; + enum freezer_state state; + + task_lock(task); + freezer = task_freezer(task); + state = freezer->state; + task_unlock(task); + + return state == STATE_FROZEN; +} + +/* + * cgroups_write_string() limits the size of freezer state strings to + * CGROUP_LOCAL_BUFFER_SIZE + */ +static const char *freezer_state_strs[] = { + "RUNNING", + "FREEZING", + "FROZEN", +}; + +/* + * State diagram + * Transitions are caused by userspace writes to the freezer.state file. + * The values in parenthesis are state labels. The rest are edge labels. + * + * (RUNNING) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) + * ^ ^ | | + * | \_______RUNNING_______/ | + * \_____________________________RUNNING___________/ + */ + +struct cgroup_subsys freezer_subsys; + +/* Locks taken and their ordering + * ------------------------------ + * css_set_lock + * cgroup_mutex (AKA cgroup_lock) + * task->alloc_lock (AKA task_lock) + * freezer->lock + * task->sighand->siglock + * + * cgroup code forces css_set_lock to be taken before task->alloc_lock + * + * freezer_create(), freezer_destroy(): + * cgroup_mutex [ by cgroup core ] + * + * can_attach(): + * cgroup_mutex + * + * cgroup_frozen(): + * task->alloc_lock (to get task's cgroup) + * + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): + * task->alloc_lock (to get task's cgroup) + * freezer->lock + * sighand->siglock (if the cgroup is freezing) + * + * freezer_read(): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * + * freezer_write() (freeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * sighand->siglock + * + * freezer_write() (unfreeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * task->alloc_lock (to prevent races with freeze_task()) + * sighand->siglock + */ +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&freezer->lock); + freezer->state = STATE_RUNNING; + return &freezer->css; +} + +static void freezer_destroy(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + kfree(cgroup_freezer(cgroup)); +} + + +static int freezer_can_attach(struct cgroup_subsys *ss, + struct cgroup *new_cgroup, + struct task_struct *task) +{ + struct freezer *freezer; + int retval = 0; + + /* + * The call to cgroup_lock() in the freezer.state write method prevents + * a write to that file racing against an attach, and hence the + * can_attach() result will remain valid until the attach completes. + */ + freezer = cgroup_freezer(new_cgroup); + if (freezer->state == STATE_FROZEN) + retval = -EBUSY; + return retval; +} + +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +{ + struct freezer *freezer; + + task_lock(task); + freezer = task_freezer(task); + task_unlock(task); + + BUG_ON(freezer->state == STATE_FROZEN); + spin_lock_irq(&freezer->lock); + /* Locking avoids race with FREEZING -> RUNNING transitions. */ + if (freezer->state == STATE_FREEZING) + freeze_task(task, true); + spin_unlock_irq(&freezer->lock); +} + +/* + * caller must hold freezer->lock + */ +static void check_if_frozen(struct cgroup *cgroup, + struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int nfrozen = 0, ntotal = 0; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + ntotal++; + /* + * Task is frozen or will freeze immediately when next it gets + * woken + */ + if (frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task))) + nfrozen++; + } + + /* + * Transition to FROZEN when no new tasks can be added ensures + * that we never exist in the FROZEN state while there are unfrozen + * tasks. + */ + if (nfrozen == ntotal) + freezer->state = STATE_FROZEN; + cgroup_iter_end(cgroup, &it); +} + +static int freezer_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct freezer *freezer; + enum freezer_state state; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + state = freezer->state; + if (state == STATE_FREEZING) { + /* We change from FREEZING to FROZEN lazily if the cgroup was + * only partially frozen when we exitted write. */ + check_if_frozen(cgroup, freezer); + state = freezer->state; + } + spin_unlock_irq(&freezer->lock); + cgroup_unlock(); + + seq_puts(m, freezer_state_strs[state]); + seq_putc(m, '\n'); + return 0; +} + +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int num_cant_freeze_now = 0; + + freezer->state = STATE_FREEZING; + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + if (!freeze_task(task, true)) + continue; + if (task_is_stopped_or_traced(task) && freezing(task)) + /* + * The freeze flag is set so these tasks will + * immediately go into the fridge upon waking. + */ + continue; + if (!freezing(task) && !freezer_should_skip(task)) + num_cant_freeze_now++; + } + cgroup_iter_end(cgroup, &it); + + return num_cant_freeze_now ? -EBUSY : 0; +} + +static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + int do_wake; + + task_lock(task); + do_wake = __thaw_process(task); + task_unlock(task); + if (do_wake) + wake_up_process(task); + } + cgroup_iter_end(cgroup, &it); + freezer->state = STATE_RUNNING; + + return 0; +} + +static int freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) +{ + struct freezer *freezer; + int retval = 0; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + check_if_frozen(cgroup, freezer); /* may update freezer->state */ + if (goal_state == freezer->state) + goto out; + switch (freezer->state) { + case STATE_RUNNING: + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + case STATE_FREEZING: + if (goal_state == STATE_FROZEN) { + /* Userspace is retrying after + * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + } + /* state == FREEZING and goal_state == RUNNING, so unfreeze */ + case STATE_FROZEN: + retval = unfreeze_cgroup(cgroup, freezer); + break; + default: + break; + } +out: + spin_unlock_irq(&freezer->lock); + + return retval; +} + +static int freezer_write(struct cgroup *cgroup, + struct cftype *cft, + const char *buffer) +{ + int retval; + enum freezer_state goal_state; + + if (strcmp(buffer, freezer_state_strs[STATE_RUNNING]) == 0) + goal_state = STATE_RUNNING; + else if (strcmp(buffer, freezer_state_strs[STATE_FROZEN]) == 0) + goal_state = STATE_FROZEN; + else + return -EIO; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + retval = freezer_change_state(cgroup, goal_state); + cgroup_unlock(); + return retval; +} + +static struct cftype files[] = { + { + .name = "state", + .read_seq_string = freezer_read, + .write_string = freezer_write, + }, +}; + +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys freezer_subsys = { + .name = "freezer", + .create = freezer_create, + .destroy = freezer_destroy, + .populate = freezer_populate, + .subsys_id = freezer_subsys_id, + .can_attach = freezer_can_attach, + .attach = NULL, + .fork = freezer_fork, + .exit = NULL, +}; diff --git a/kernel/freezer.c b/kernel/freezer.c index cb0931f8930..ba6248b323e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -120,3 +120,35 @@ void cancel_freezing(struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } } + +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails. Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. + */ +int __thaw_process(struct task_struct *p) +{ + if (frozen(p)) { + p->flags &= ~PF_FROZEN; + return 1; + } + clear_freeze_flag(p); + return 0; +} + +int thaw_process(struct task_struct *p) +{ + task_lock(p); + if (__thaw_process(p) == 1) { + task_unlock(p); + wake_up_process(p); + return 1; + } + task_unlock(p); + return 0; +} +EXPORT_SYMBOL(thaw_process); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ebdd7f55273..dcd165f92a8 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -85,9 +85,6 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y -config FREEZER - def_bool PM_SLEEP - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE -- cgit v1.2.3-70-g09d2 From 57cac4d1880527e0baf6c2fda529d2ad1d815aec Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Sat, 18 Oct 2008 20:28:25 -0700 Subject: kdump: make elfcorehdr_addr independent of CONFIG_PROC_VMCORE o elfcorehdr_addr is used by not only the code under CONFIG_PROC_VMCORE but also by the code which is not inside CONFIG_PROC_VMCORE. For example, is_kdump_kernel() is used by powerpc code to determine if kernel is booting after a panic then use previous kernel's TCE table. So even if CONFIG_PROC_VMCORE is not set in second kernel, one should be able to correctly determine that we are booting after a panic and setup calgary iommu accordingly. o So remove the assumption that elfcorehdr_addr is under CONFIG_PROC_VMCORE. o Move definition of elfcorehdr_addr to arch dependent crash files. (Unfortunately crash dump does not have an arch independent file otherwise that would have been the best place). o kexec.c is not the right place as one can Have CRASH_DUMP enabled in second kernel without KEXEC being enabled. o I don't see sh setup code parsing the command line for elfcorehdr_addr. I am wondering how does vmcore interface work on sh. Anyway, I am atleast defining elfcoredhr_addr so that compilation is not broken on sh. Signed-off-by: Vivek Goyal Acked-by: "Eric W. Biederman" Acked-by: Simon Horman Acked-by: Paul Mundt Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/crash_dump.c | 4 ++++ arch/ia64/kernel/setup.c | 9 ++++++++- arch/powerpc/kernel/crash_dump.c | 10 ++++++++-- arch/sh/kernel/crash_dump.c | 3 +++ arch/x86/kernel/crash_dump_32.c | 3 +++ arch/x86/kernel/crash_dump_64.c | 3 +++ arch/x86/kernel/setup.c | 8 +++++++- fs/proc/vmcore.c | 3 --- include/linux/crash_dump.h | 14 ++++++++++---- 9 files changed, 46 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c index da60e90eeeb..23e91290e41 100644 --- a/arch/ia64/kernel/crash_dump.c +++ b/arch/ia64/kernel/crash_dump.c @@ -8,10 +8,14 @@ #include #include +#include #include #include +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index de636b21567..a0286be6c23 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -478,7 +478,12 @@ static __init int setup_nomca(char *s) } early_param("nomca", setup_nomca); -#ifdef CONFIG_PROC_VMCORE +/* + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence + * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +#ifdef CONFIG_CRASH_DUMP /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. */ @@ -491,7 +496,9 @@ static int __init parse_elfcorehdr(char *arg) return 0; } early_param("elfcorehdr", parse_elfcorehdr); +#endif +#ifdef CONFIG_PROC_VMCORE int __init reserve_elfcorehdr(unsigned long *start, unsigned long *end) { unsigned long length; diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index a323c9b32ee..97e05637972 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -27,6 +27,9 @@ #define DBG(fmt...) #endif +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + void __init reserve_kdump_trampoline(void) { lmb_reserve(0, KDUMP_RESERVE_LIMIT); @@ -66,7 +69,11 @@ void __init setup_kdump_trampoline(void) DBG(" <- setup_kdump_trampoline()\n"); } -#ifdef CONFIG_PROC_VMCORE +/* + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence + * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ static int __init parse_elfcorehdr(char *p) { if (p) @@ -75,7 +82,6 @@ static int __init parse_elfcorehdr(char *p) return 1; } __setup("elfcorehdr=", parse_elfcorehdr); -#endif static int __init parse_savemaxmem(char *p) { diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c index 4a2ecbe27d8..95d21625556 100644 --- a/arch/sh/kernel/crash_dump.c +++ b/arch/sh/kernel/crash_dump.c @@ -10,6 +10,9 @@ #include #include +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 72d0c56c1b4..f7cdb3b457a 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -13,6 +13,9 @@ static void *kdump_buf_page; +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index e90a60ef10c..045b36cada6 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,6 +10,9 @@ #include #include +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2255782e8d4..b2c97874ec0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -561,7 +561,13 @@ static void __init reserve_standard_io_resources(void) } -#ifdef CONFIG_PROC_VMCORE +/* + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence + * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ + +#ifdef CONFIG_CRASH_DUMP /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed * by kexec loader to the capture kernel. diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 841368b87a2..4c65ca432d3 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -32,9 +32,6 @@ static size_t elfcorebuf_sz; /* Total size of vmcore file. */ static u64 vmcore_size; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - struct proc_dir_entry *proc_vmcore = NULL; /* Reads a page from the oldmem device from given offset. */ diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 025e4f57510..de027d1db74 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -9,11 +9,7 @@ #define ELFCORE_ADDR_MAX (-1ULL) -#ifdef CONFIG_PROC_VMCORE extern unsigned long long elfcorehdr_addr; -#else -static const unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; -#endif extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); @@ -28,6 +24,16 @@ extern struct proc_dir_entry *proc_vmcore; #define vmcore_elf_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x)) +/* + * is_kdump_kernel() checks whether this kernel is booting after a panic of + * previous kernel or not. This is determined by checking if previous kernel + * has passed the elf core header address on command line. + * + * This is not just a test if CONFIG_CRASH_DUMP is enabled or not. It will + * return 1 if CONFIG_CRASH_DUMP=y and if kernel is booting after a panic of + * previous kernel. + */ + static inline int is_kdump_kernel(void) { return (elfcorehdr_addr != ELFCORE_ADDR_MAX) ? 1 : 0; -- cgit v1.2.3-70-g09d2 From 357c6e63590895dc87cc9300f5a1c27544ea69e8 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 18 Oct 2008 20:28:42 -0700 Subject: rtc: use bcd2bin/bin2bcd Change various rtc related code to use the new bcd2bin/bin2bcd functions instead of the obsolete BCD_TO_BIN/BIN_TO_BCD/BCD2BIN/BIN2BCD macros. Signed-off-by: Adrian Bunk Acked-by: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/rtc.c | 20 ++++++++++---------- drivers/char/ds1286.c | 34 +++++++++++++++++----------------- drivers/char/ds1302.c | 24 ++++++++++++------------ drivers/char/ip27-rtc.c | 24 ++++++++++++------------ drivers/char/rtc.c | 40 ++++++++++++++++++++-------------------- include/asm-generic/rtc.h | 24 ++++++++++++------------ 6 files changed, 83 insertions(+), 83 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 0a23b5795b2..dd6f2b71561 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime) cmos_minutes = CMOS_READ(RTC_MINUTES); if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(cmos_minutes); + cmos_minutes = bcd2bin(cmos_minutes); /* * since we're only adjusting minutes and seconds, @@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) < 30) { if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); } CMOS_WRITE(real_seconds,RTC_SECONDS); CMOS_WRITE(real_minutes,RTC_MINUTES); @@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void) WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); } if (century) { - BCD_TO_BIN(century); + century = bcd2bin(century); year += century * 100; printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); } else diff --git a/drivers/char/ds1286.c b/drivers/char/ds1286.c index 5329d482b58..0a826d7be10 100644 --- a/drivers/char/ds1286.c +++ b/drivers/char/ds1286.c @@ -210,8 +210,8 @@ static int ds1286_ioctl(struct inode *inode, struct file *file, if (sec != 0) return -EINVAL; - min = BIN2BCD(min); - min = BIN2BCD(hrs); + min = bin2bcd(min); + min = bin2bcd(hrs); spin_lock(&ds1286_lock); rtc_write(hrs, RTC_HOURS_ALARM); @@ -353,7 +353,7 @@ static int ds1286_proc_output(char *buf) ds1286_get_time(&tm); hundredth = rtc_read(RTC_HUNDREDTH_SECOND); - BCD_TO_BIN(hundredth); + hundredth = bcd2bin(hundredth); p += sprintf(p, "rtc_time\t: %02d:%02d:%02d.%02d\n" @@ -477,12 +477,12 @@ static void ds1286_get_time(struct rtc_time *rtc_tm) rtc_write(save_control, RTC_CMD); spin_unlock_irqrestore(&ds1286_lock, flags); - BCD_TO_BIN(rtc_tm->tm_sec); - BCD_TO_BIN(rtc_tm->tm_min); - BCD_TO_BIN(rtc_tm->tm_hour); - BCD_TO_BIN(rtc_tm->tm_mday); - BCD_TO_BIN(rtc_tm->tm_mon); - BCD_TO_BIN(rtc_tm->tm_year); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); /* * Account for differences between how the RTC uses the values @@ -531,12 +531,12 @@ static int ds1286_set_time(struct rtc_time *rtc_tm) if (yrs >= 100) yrs -= 100; - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); spin_lock_irqsave(&ds1286_lock, flags); save_control = rtc_read(RTC_CMD); @@ -572,8 +572,8 @@ static void ds1286_get_alm_time(struct rtc_time *alm_tm) cmd = rtc_read(RTC_CMD); spin_unlock_irqrestore(&ds1286_lock, flags); - BCD_TO_BIN(alm_tm->tm_min); - BCD_TO_BIN(alm_tm->tm_hour); + alm_tm->tm_min = bcd2bin(alm_tm->tm_min); + alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour); alm_tm->tm_sec = 0; } diff --git a/drivers/char/ds1302.c b/drivers/char/ds1302.c index c5e67a62395..170693c93c7 100644 --- a/drivers/char/ds1302.c +++ b/drivers/char/ds1302.c @@ -131,12 +131,12 @@ get_rtc_time(struct rtc_time *rtc_tm) local_irq_restore(flags); - BCD_TO_BIN(rtc_tm->tm_sec); - BCD_TO_BIN(rtc_tm->tm_min); - BCD_TO_BIN(rtc_tm->tm_hour); - BCD_TO_BIN(rtc_tm->tm_mday); - BCD_TO_BIN(rtc_tm->tm_mon); - BCD_TO_BIN(rtc_tm->tm_year); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); /* * Account for differences between how the RTC uses the values @@ -211,12 +211,12 @@ static long rtc_ioctl(struct file *file, unsigned int cmd, unsigned long arg) else yrs -= 1900; /* RTC (70, 71, ... 99) */ - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); lock_kernel(); local_irq_save(flags); diff --git a/drivers/char/ip27-rtc.c b/drivers/char/ip27-rtc.c index ec9d0443d92..2abd881b4cb 100644 --- a/drivers/char/ip27-rtc.c +++ b/drivers/char/ip27-rtc.c @@ -130,12 +130,12 @@ static long rtc_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (yrs >= 100) yrs -= 100; - sec = BIN2BCD(sec); - min = BIN2BCD(min); - hrs = BIN2BCD(hrs); - day = BIN2BCD(day); - mon = BIN2BCD(mon); - yrs = BIN2BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); spin_lock_irq(&rtc_lock); rtc->control |= M48T35_RTC_SET; @@ -311,12 +311,12 @@ static void get_rtc_time(struct rtc_time *rtc_tm) rtc->control &= ~M48T35_RTC_READ; spin_unlock_irq(&rtc_lock); - rtc_tm->tm_sec = BCD2BIN(rtc_tm->tm_sec); - rtc_tm->tm_min = BCD2BIN(rtc_tm->tm_min); - rtc_tm->tm_hour = BCD2BIN(rtc_tm->tm_hour); - rtc_tm->tm_mday = BCD2BIN(rtc_tm->tm_mday); - rtc_tm->tm_mon = BCD2BIN(rtc_tm->tm_mon); - rtc_tm->tm_year = BCD2BIN(rtc_tm->tm_year); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); /* * Account for differences between how the RTC uses the values diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c index 17683de9571..32dc89720d5 100644 --- a/drivers/char/rtc.c +++ b/drivers/char/rtc.c @@ -518,17 +518,17 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel) if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { if (sec < 60) - BIN_TO_BCD(sec); + sec = bin2bcd(sec); else sec = 0xff; if (min < 60) - BIN_TO_BCD(min); + min = bin2bcd(min); else min = 0xff; if (hrs < 24) - BIN_TO_BCD(hrs); + hrs = bin2bcd(hrs); else hrs = 0xff; } @@ -614,12 +614,12 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel) if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); } save_control = CMOS_READ(RTC_CONTROL); @@ -1099,7 +1099,7 @@ no_irq: spin_unlock_irq(&rtc_lock); if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(year); /* This should never happen... */ + year = bcd2bin(year); /* This should never happen... */ if (year < 20) { epoch = 2000; @@ -1352,13 +1352,13 @@ static void rtc_get_rtc_time(struct rtc_time *rtc_tm) spin_unlock_irqrestore(&rtc_lock, flags); if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BCD_TO_BIN(rtc_tm->tm_sec); - BCD_TO_BIN(rtc_tm->tm_min); - BCD_TO_BIN(rtc_tm->tm_hour); - BCD_TO_BIN(rtc_tm->tm_mday); - BCD_TO_BIN(rtc_tm->tm_mon); - BCD_TO_BIN(rtc_tm->tm_year); - BCD_TO_BIN(rtc_tm->tm_wday); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); + rtc_tm->tm_wday = bcd2bin(rtc_tm->tm_wday); } #ifdef CONFIG_MACH_DECSTATION @@ -1392,9 +1392,9 @@ static void get_rtc_alm_time(struct rtc_time *alm_tm) spin_unlock_irq(&rtc_lock); if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BCD_TO_BIN(alm_tm->tm_sec); - BCD_TO_BIN(alm_tm->tm_min); - BCD_TO_BIN(alm_tm->tm_hour); + alm_tm->tm_sec = bcd2bin(alm_tm->tm_sec); + alm_tm->tm_min = bcd2bin(alm_tm->tm_min); + alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour); } } diff --git a/include/asm-generic/rtc.h b/include/asm-generic/rtc.h index 71ef3f0b968..89061c1a67d 100644 --- a/include/asm-generic/rtc.h +++ b/include/asm-generic/rtc.h @@ -84,12 +84,12 @@ static inline unsigned int get_rtc_time(struct rtc_time *time) if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BCD_TO_BIN(time->tm_sec); - BCD_TO_BIN(time->tm_min); - BCD_TO_BIN(time->tm_hour); - BCD_TO_BIN(time->tm_mday); - BCD_TO_BIN(time->tm_mon); - BCD_TO_BIN(time->tm_year); + time->tm_sec = bcd2bin(time->tm_sec); + time->tm_min = bcd2bin(time->tm_min); + time->tm_hour = bcd2bin(time->tm_hour); + time->tm_mday = bcd2bin(time->tm_mday); + time->tm_mon = bcd2bin(time->tm_mon); + time->tm_year = bcd2bin(time->tm_year); } #ifdef CONFIG_MACH_DECSTATION @@ -159,12 +159,12 @@ static inline int set_rtc_time(struct rtc_time *time) if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); } save_control = CMOS_READ(RTC_CONTROL); -- cgit v1.2.3-70-g09d2 From c513867561eeb07d24a0bdda1a18a8f91921a301 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Oct 2008 18:08:48 -0400 Subject: ftrace: do not enclose logic in WARN_ON In ftrace, logic is defined in the WARN_ON_ONCE, which can become a nop with some configs. This patch fixes it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d073d981a73..8821ceabf51 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -62,6 +62,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { unsigned char replaced[MCOUNT_INSN_SIZE]; + int ret; /* * Note: Due to modules and __init, code can @@ -77,8 +78,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) return 2; - WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code, - MCOUNT_INSN_SIZE)); + ret = __copy_to_user_inatomic((char __user *)ip, new_code, + MCOUNT_INSN_SIZE); + WARN_ON_ONCE(ret); sync_core(); -- cgit v1.2.3-70-g09d2 From 606576ce816603d9fe1fb453a88bc6eea16ca709 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 19:06:12 -0400 Subject: ftrace: rename FTRACE to FUNCTION_TRACER Due to confusion between the ftrace infrastructure and the gcc profiling tracer "ftrace", this patch renames the config options from FTRACE to FUNCTION_TRACER. The other two names that are offspring from FTRACE DYNAMIC_FTRACE and FTRACE_MCOUNT_RECORD will stay the same. This patch was generated mostly by script, and partially by hand. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Makefile | 2 +- arch/arm/Kconfig | 4 ++-- arch/arm/boot/compressed/Makefile | 2 +- arch/arm/include/asm/ftrace.h | 2 +- arch/arm/kernel/armksyms.c | 2 +- arch/arm/kernel/entry-common.S | 4 ++-- arch/powerpc/Kconfig | 2 +- arch/powerpc/Makefile | 2 +- arch/powerpc/include/asm/ftrace.h | 2 +- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/entry_32.S | 2 +- arch/powerpc/kernel/entry_64.S | 2 +- arch/powerpc/kernel/ppc_ksyms.c | 2 +- arch/powerpc/platforms/powermac/Makefile | 2 +- arch/sparc64/Kconfig | 2 +- arch/sparc64/Kconfig.debug | 2 +- arch/sparc64/lib/mcount.S | 4 ++-- arch/x86/Kconfig | 2 +- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/entry_32.S | 4 ++-- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/i386_ksyms_32.c | 2 +- arch/x86/kernel/x8664_ksyms_64.c | 2 +- arch/x86/xen/Makefile | 2 +- include/asm-x86/ftrace.h | 4 ++-- include/linux/ftrace.h | 12 ++++++------ kernel/Makefile | 4 ++-- kernel/sysctl.c | 2 +- kernel/trace/Kconfig | 17 +++++++++-------- kernel/trace/Makefile | 6 +++--- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_irqsoff.c | 4 ++-- kernel/trace/trace_sched_wakeup.c | 4 ++-- kernel/trace/trace_selftest.c | 4 ++-- lib/Makefile | 2 +- 36 files changed, 61 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/Makefile b/Makefile index 16e3fbb968a..b7eb70b13ca 100644 --- a/Makefile +++ b/Makefile @@ -536,7 +536,7 @@ KBUILD_CFLAGS += -g KBUILD_AFLAGS += -gdwarf-2 endif -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER KBUILD_CFLAGS += -pg endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4853f9df37b..c2f18ea4050 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -16,8 +16,8 @@ config ARM select HAVE_ARCH_KGDB select HAVE_KPROBES if (!XIP_KERNEL) select HAVE_KRETPROBES if (HAVE_KPROBES) - select HAVE_FTRACE if (!XIP_KERNEL) - select HAVE_DYNAMIC_FTRACE if (HAVE_FTRACE) + select HAVE_FUNCTION_TRACER if (!XIP_KERNEL) + select HAVE_DYNAMIC_FTRACE if (HAVE_FUNCTION_TRACER) select HAVE_GENERIC_DMA_COHERENT help The ARM series is a line of low-power-consumption RISC chip designs diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 7a03f200788..c47f2a3f8f8 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -70,7 +70,7 @@ SEDFLAGS = s/TEXT_START/$(ZTEXTADDR)/;s/BSS_START/$(ZBSSADDR)/ targets := vmlinux vmlinux.lds piggy.gz piggy.o font.o font.c \ head.o misc.o $(OBJS) -ifeq ($(CONFIG_FTRACE),y) +ifeq ($(CONFIG_FUNCTION_TRACER),y) ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) endif diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 584ef9a8e5a..39c8bc1a006 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -1,7 +1,7 @@ #ifndef _ASM_ARM_FTRACE #define _ASM_ARM_FTRACE -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 2357b1cf1cf..c74f766ffc1 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -183,6 +183,6 @@ EXPORT_SYMBOL(_find_next_bit_be); EXPORT_SYMBOL(copy_page); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER EXPORT_SYMBOL(mcount); #endif diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 3aa14dcc5ba..06269ea375c 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -101,7 +101,7 @@ ENDPROC(ret_from_fork) #undef CALL #define CALL(x) .long x -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) stmdb sp!, {r0-r3, lr} @@ -149,7 +149,7 @@ trace: ftrace_stub: mov pc, lr -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /*============================================================================= * SWI handler diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 380baa1780e..97d86702e2d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -112,7 +112,7 @@ config PPC bool default y select HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE + select HAVE_FUNCTION_TRACER select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 24dd1a37f8f..1f066706994 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -122,7 +122,7 @@ KBUILD_CFLAGS += -mcpu=powerpc endif # Work around a gcc code-gen bug with -fno-omit-frame-pointer. -ifeq ($(CONFIG_FTRACE),y) +ifeq ($(CONFIG_FUNCTION_TRACER),y) KBUILD_CFLAGS += -mno-sched-epilog endif diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index de921326cca..b298f7a631e 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -1,7 +1,7 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fdb58253fa5..92673b43858 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -12,7 +12,7 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 1cbbf703364..7ecc0d1855c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1158,7 +1158,7 @@ machine_check_in_rtas: #endif /* CONFIG_PPC_RTAS */ -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index fd8b4bae9b0..e6d52845854 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -884,7 +884,7 @@ _GLOBAL(enter_prom) mtlr r0 blr -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 8edc2359c41..260089dccfb 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -68,7 +68,7 @@ EXPORT_SYMBOL(single_step_exception); EXPORT_SYMBOL(sys_sigreturn); #endif -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER EXPORT_SYMBOL(_mcount); #endif diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index be60d64be7a..50f16939255 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -1,6 +1,6 @@ CFLAGS_bootx_init.o += -fPIC -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_bootx_init.o = -pg -mno-sched-epilog endif diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 5446e2a499b..d269400d286 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -12,7 +12,7 @@ config SPARC64 bool default y select HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE + select HAVE_FUNCTION_TRACER select HAVE_IDE select HAVE_LMB select HAVE_ARCH_KGDB diff --git a/arch/sparc64/Kconfig.debug b/arch/sparc64/Kconfig.debug index d6d32d178fc..c40515c0669 100644 --- a/arch/sparc64/Kconfig.debug +++ b/arch/sparc64/Kconfig.debug @@ -33,7 +33,7 @@ config DEBUG_PAGEALLOC config MCOUNT bool - depends on STACK_DEBUG || FTRACE + depends on STACK_DEBUG || FUNCTION_TRACER default y config FRAME_POINTER diff --git a/arch/sparc64/lib/mcount.S b/arch/sparc64/lib/mcount.S index fad90ddb3a2..7ce9c65f359 100644 --- a/arch/sparc64/lib/mcount.S +++ b/arch/sparc64/lib/mcount.S @@ -93,7 +93,7 @@ mcount: nop 1: #endif -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE mov %o7, %o0 .globl mcount_call @@ -119,7 +119,7 @@ mcount_call: .size _mcount,.-_mcount .size mcount,.-mcount -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER .globl ftrace_stub .type ftrace_stub,#function ftrace_stub: diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 40ee8080956..290e21aa774 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,7 +28,7 @@ config X86 select HAVE_KRETPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE + select HAVE_FUNCTION_TRACER select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d41f0343dc..ec3d30136bf 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -6,7 +6,7 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4e4269c73bb..9d49facc21f 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1149,7 +1149,7 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -1204,7 +1204,7 @@ trace: jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 09e7145484c..b86f332c96a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -61,7 +61,7 @@ .code64 -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) retq @@ -138,7 +138,7 @@ trace: jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index dd7ebee446a..43cec6bdda6 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -5,7 +5,7 @@ #include #include -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* mcount is defined in assembly */ EXPORT_SYMBOL(mcount); #endif diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index b545f371b5f..695e426aa35 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -12,7 +12,7 @@ #include #include -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* mcount is defined in assembly */ EXPORT_SYMBOL(mcount); #endif diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 313947940a1..6dcefba7836 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_spinlock.o = -pg CFLAGS_REMOVE_time.o = -pg diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h index 1bb6f9bbe1a..233bb9b869c 100644 --- a/include/asm-x86/ftrace.h +++ b/include/asm-x86/ftrace.h @@ -1,7 +1,7 @@ #ifndef ASM_X86__FTRACE_H #define ASM_X86__FTRACE_H -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ @@ -19,6 +19,6 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) } #endif -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* ASM_X86__FTRACE_H */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a3d46151be1..0e952958915 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -8,7 +8,7 @@ #include #include -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; extern int @@ -36,12 +36,12 @@ void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1); -#else /* !CONFIG_FTRACE */ +#else /* !CONFIG_FUNCTION_TRACER */ # define register_ftrace_function(ops) do { } while (0) # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) static inline void ftrace_kill_atomic(void) { } -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE # define FTRACE_HASHBITS 10 @@ -101,7 +101,7 @@ void ftrace_kill_atomic(void); static inline void tracer_disable(void) { -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER ftrace_enabled = 0; #endif } @@ -113,7 +113,7 @@ static inline void tracer_disable(void) */ static inline int __ftrace_enabled_save(void) { -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER int saved_ftrace_enabled = ftrace_enabled; ftrace_enabled = 0; return saved_ftrace_enabled; @@ -124,7 +124,7 @@ static inline int __ftrace_enabled_save(void) static inline void __ftrace_enabled_restore(int enabled) { -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER ftrace_enabled = enabled; #endif } diff --git a/kernel/Makefile b/kernel/Makefile index 8f9ce7ec21b..85f588a9d0b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -13,7 +13,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ CFLAGS_REMOVE_sched.o = -mno-spe -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg CFLAGS_REMOVE_lockdep_proc.o = -pg @@ -86,7 +86,7 @@ obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o -obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e4d6a..619eb9f3acd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -464,7 +464,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER { .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_enabled", diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5866edbc2ed..3533c583df4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,11 +1,12 @@ # -# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# Architectures that offer an FUNCTION_TRACER implementation should +# select HAVE_FUNCTION_TRACER: # config NOP_TRACER bool -config HAVE_FTRACE +config HAVE_FUNCTION_TRACER bool select NOP_TRACER @@ -28,9 +29,9 @@ config TRACING select STACKTRACE select TRACEPOINTS -config FTRACE +config FUNCTION_TRACER bool "Kernel Function Tracer" - depends on HAVE_FTRACE + depends on HAVE_FUNCTION_TRACER depends on DEBUG_KERNEL select FRAME_POINTER select TRACING @@ -136,9 +137,9 @@ config BOOT_TRACER config STACK_TRACER bool "Trace max stack" - depends on HAVE_FTRACE + depends on HAVE_FUNCTION_TRACER depends on DEBUG_KERNEL - select FTRACE + select FUNCTION_TRACER select STACKTRACE help This special tracer records the maximum stack footprint of the @@ -155,7 +156,7 @@ config STACK_TRACER config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" - depends on FTRACE + depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE depends on DEBUG_KERNEL default y @@ -165,7 +166,7 @@ config DYNAMIC_FTRACE with a No-Op instruction) as they are called. A table is created to dynamically enable them again. - This way a CONFIG_FTRACE kernel is slightly larger, but otherwise + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise has native performance as long as no tracing is active. The changes to the code are done by a kernel thread that diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index a85dfba88ba..c8228b1a49e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,7 +1,7 @@ # Do not instrument the tracer itself: -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) @@ -10,13 +10,13 @@ CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif -obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o -obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d345d649d07..aeb2f2505bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -851,7 +851,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) preempt_enable_notrace(); } -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER static void function_trace_call(unsigned long ip, unsigned long parent_ip) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f1f99572cde..6889ca48f1f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -335,7 +335,7 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER void tracing_start_function_trace(void); void tracing_stop_function_trace(void); #else diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a7db7f040ae..9c74071c10e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -63,7 +63,7 @@ irq_trace(void) */ static __cacheline_aligned_in_smp unsigned long max_sequence; -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: */ @@ -104,7 +104,7 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, }; -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /* * Should this new latency be reported/recorded? diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fe4a252c236..3ae93f16b56 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,7 +31,7 @@ static raw_spinlock_t wakeup_lock = static void __wakeup_reset(struct trace_array *tr); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: */ @@ -96,7 +96,7 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, }; -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /* * Should this new latency be reported/recorded? diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 09cf230d7ec..95815d26a04 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -70,7 +70,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) return ret; } -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -226,7 +226,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) return ret; } -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_IRQSOFF_TRACER int diff --git a/lib/Makefile b/lib/Makefile index 16feaab057b..7cb65d85aeb 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for some libs needed in the kernel. # -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) endif -- cgit v1.2.3-70-g09d2 From d768cb6929773060171eee8397a63883f60ddc07 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 25 Aug 2008 15:44:59 -0600 Subject: x86/PCI: follow lspci device/vendor style Use "[%04x:%04x]" for PCI vendor/device IDs to follow the format used by lspci(8). Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 006599db0dc..52a1de1128c 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -493,7 +493,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq if (pirq <= 4) irq = read_config_nybble(router, 0x56, pirq - 1); dev_info(&dev->dev, - "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n", + "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n", dev->vendor, dev->device, pirq, irq); return irq; } @@ -501,7 +501,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { dev_info(&dev->dev, - "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n", + "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n", dev->vendor, dev->device, pirq, irq); if (pirq <= 4) write_config_nybble(router, 0x56, pirq - 1, irq); @@ -823,7 +823,7 @@ static void __init pirq_find_router(struct irq_router *r) r->get = NULL; r->set = NULL; - DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n", rt->rtr_vendor, rt->rtr_device); pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); @@ -843,7 +843,7 @@ static void __init pirq_find_router(struct irq_router *r) h->probe(r, pirq_router_dev, pirq_router_dev->device)) break; } - dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n", + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", pirq_router.name, pirq_router_dev->vendor, pirq_router_dev->device); -- cgit v1.2.3-70-g09d2 From 37a84ec668ba251ae02cf2c2c664baf6b247ae1f Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Thu, 28 Aug 2008 15:40:59 -0700 Subject: x86/PCI: irq and pci_ids patch for Intel Ibex Peak DeviceIDs This patch updates the Intel Ibex Peak (PCH) LPC and SMBus Controller DeviceIDs. The LPC Controller ID is set by Firmware within the range of 0x3b00-3b1f. This range is included in pci_ids.h using min and max values, and irq.c now has code to handle the range (in lieu of 32 additions to a SWITCH statement). The SMBus Controller ID is a fixed-value and will not change. Signed-off-by: Seth Heasley Acked-by: Jean Delvare Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 11 +++++++++-- include/linux/pci_ids.h | 6 +++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 52a1de1128c..bf69dbe08bf 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -590,13 +590,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: - case PCI_DEVICE_ID_INTEL_PCH_0: - case PCI_DEVICE_ID_INTEL_PCH_1: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } + + if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; + } + return 0; } diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 8edddc240e4..e5d344bfcb7 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2454,9 +2454,9 @@ #define PCI_DEVICE_ID_INTEL_ICH10_3 0x3a1a #define PCI_DEVICE_ID_INTEL_ICH10_4 0x3a30 #define PCI_DEVICE_ID_INTEL_ICH10_5 0x3a60 -#define PCI_DEVICE_ID_INTEL_PCH_0 0x3b10 -#define PCI_DEVICE_ID_INTEL_PCH_1 0x3b11 -#define PCI_DEVICE_ID_INTEL_PCH_2 0x3b30 +#define PCI_DEVICE_ID_INTEL_PCH_LPC_MIN 0x3b00 +#define PCI_DEVICE_ID_INTEL_PCH_LPC_MAX 0x3b1f +#define PCI_DEVICE_ID_INTEL_PCH_SMBUS 0x3b30 #define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f #define PCI_DEVICE_ID_INTEL_5100_16 0x65f0 #define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 -- cgit v1.2.3-70-g09d2 From db7a6d8d01b21829d28638258cbbc9553210bac1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Oct 2008 11:24:31 -0700 Subject: Update .gitignore files for generated targets The generated 'capflags.c' file wasn't properly ignored, and the list of files in scripts/basic/ wasn't up-to-date. Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/.gitignore | 1 + scripts/basic/.gitignore | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/.gitignore (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore new file mode 100644 index 00000000000..667df55a439 --- /dev/null +++ b/arch/x86/kernel/cpu/.gitignore @@ -0,0 +1 @@ +capflags.c diff --git a/scripts/basic/.gitignore b/scripts/basic/.gitignore index 7304e19782c..bf8b199ec59 100644 --- a/scripts/basic/.gitignore +++ b/scripts/basic/.gitignore @@ -1,3 +1,3 @@ +hash fixdep -split-include docproc -- cgit v1.2.3-70-g09d2 From f4432c5caec5fa95ea7eefd00f8e6cee17e2e023 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 20 Oct 2008 13:31:45 -0400 Subject: Update email addresses. Update assorted email addresses and related info to point to a single current, valid address. additionally - trivial CREDITS entry updates. (Not that this file means much any more) - remove arjans dead redhat.com address from powernow driver Signed-off-by: Dave Jones Signed-off-by: Linus Torvalds --- CREDITS | 12 ++++++------ MAINTAINERS | 2 +- arch/x86/kernel/cpu/cpufreq/longhaul.c | 4 ++-- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 4 ++-- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 2 +- arch/x86/kernel/cpu/mcheck/k7.c | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_32.c | 2 +- arch/x86/kernel/cpu/mcheck/non-fatal.c | 2 +- drivers/char/agp/ali-agp.c | 2 +- drivers/char/agp/amd64-agp.c | 2 +- drivers/char/agp/ati-agp.c | 2 +- drivers/char/agp/backend.c | 2 +- drivers/char/agp/intel-agp.c | 2 +- drivers/char/agp/nvidia-agp.c | 2 +- drivers/char/agp/via-agp.c | 2 +- scripts/checkpatch.pl | 2 +- 18 files changed, 26 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/CREDITS b/CREDITS index c62dcb3b7e2..2358846f06b 100644 --- a/CREDITS +++ b/CREDITS @@ -1653,14 +1653,14 @@ S: Chapel Hill, North Carolina 27514-4818 S: USA N: Dave Jones -E: davej@codemonkey.org.uk +E: davej@redhat.com W: http://www.codemonkey.org.uk -D: x86 errata/setup maintenance. -D: AGPGART driver. +D: Assorted VIA x86 support. +D: 2.5 AGPGART overhaul. D: CPUFREQ maintenance. -D: Backport/Forwardport merge monkey. -D: Various Janitor work. -S: United Kingdom +D: Fedora kernel maintainence. +D: Misc/Other. +S: 314 Littleton Rd, Westford, MA 01886, USA N: Martin Josfsson E: gandalf@wlug.westbo.se diff --git a/MAINTAINERS b/MAINTAINERS index 355c192d699..5c3f79c2638 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1198,7 +1198,7 @@ S: Maintained CPU FREQUENCY DRIVERS P: Dave Jones -M: davej@codemonkey.org.uk +M: davej@redhat.com L: cpufreq@vger.kernel.org W: http://www.codemonkey.org.uk/projects/cpufreq/ T: git kernel.org/pub/scm/linux/kernel/git/davej/cpufreq.git diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 06fcce516d5..b0461856acf 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -1,5 +1,5 @@ /* - * (C) 2001-2004 Dave Jones. + * (C) 2001-2004 Dave Jones. * (C) 2002 Padraig Brady. * * Licensed under the terms of the GNU GPL License version 2. @@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); module_param(revid_errata, int, 0644); MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); -MODULE_AUTHOR ("Dave Jones "); +MODULE_AUTHOR ("Dave Jones "); MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index b5ced806a31..c1ac5790c63 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -246,7 +246,7 @@ static void __exit powernow_k6_exit(void) } -MODULE_AUTHOR("Arjan van de Ven , Dave Jones , Dominik Brodowski "); +MODULE_AUTHOR("Arjan van de Ven, Dave Jones , Dominik Brodowski "); MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 0a61159d7b7..7c7d56b4313 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -1,6 +1,6 @@ /* * AMD K7 Powernow driver. - * (C) 2003 Dave Jones on behalf of SuSE Labs. + * (C) 2003 Dave Jones on behalf of SuSE Labs. * (C) 2003-2004 Dave Jones * * Licensed under the terms of the GNU GPL License version 2. @@ -692,7 +692,7 @@ static void __exit powernow_exit (void) module_param(acpi_force, int, 0444); MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); -MODULE_AUTHOR ("Dave Jones "); +MODULE_AUTHOR ("Dave Jones "); MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 84bb395038d..008d23ba491 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -7,7 +7,7 @@ * Support : mark.langsdorf@amd.com * * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones on behalf of SuSE Labs + * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski * (C) 2004 Pavel Machek * Licensed under the terms of the GNU GPL License version 2. diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 191f7263c61..04d0376b64b 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -431,7 +431,7 @@ static void __exit speedstep_exit(void) } -MODULE_AUTHOR ("Dave Jones , Dominik Brodowski "); +MODULE_AUTHOR ("Dave Jones , Dominik Brodowski "); MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index f390c9f6635..dd3af6e7b39 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -1,6 +1,6 @@ /* - * Athlon/Hammer specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones + * Athlon specific Machine Check Exception Reporting + * (C) Copyright 2002 Dave Jones */ #include diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 774d87cfd8c..0ebf3fc6a61 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -1,6 +1,6 @@ /* * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox , Dave Jones + * (c) 2002 Alan Cox , Dave Jones */ #include diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index cc1fccdd31e..a74af128efc 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -1,7 +1,7 @@ /* * Non Fatal Machine Check Exception Reporting * - * (C) Copyright 2002 Dave Jones. + * (C) Copyright 2002 Dave Jones. * * This file contains routines to check for non-fatal MCEs every 15s * diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 31dcd9142d5..dc8d1a90971 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -417,6 +417,6 @@ static void __exit agp_ali_cleanup(void) module_init(agp_ali_init); module_exit(agp_ali_cleanup); -MODULE_AUTHOR("Dave Jones "); +MODULE_AUTHOR("Dave Jones "); MODULE_LICENSE("GPL and additional rights"); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 2812ee2b165..52f4361eb6e 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -772,6 +772,6 @@ module_init(agp_amd64_init); module_exit(agp_amd64_cleanup); #endif -MODULE_AUTHOR("Dave Jones , Andi Kleen"); +MODULE_AUTHOR("Dave Jones , Andi Kleen"); module_param(agp_try_unsupported, bool, 0); MODULE_LICENSE("GPL"); diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index ae2791b926b..f1537eece07 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -561,6 +561,6 @@ static void __exit agp_ati_cleanup(void) module_init(agp_ati_init); module_exit(agp_ati_cleanup); -MODULE_AUTHOR("Dave Jones "); +MODULE_AUTHOR("Dave Jones "); MODULE_LICENSE("GPL and additional rights"); diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 3a3cc03d401..8c617ad7497 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -349,7 +349,7 @@ static __init int agp_setup(char *s) __setup("agp=", agp_setup); #endif -MODULE_AUTHOR("Dave Jones "); +MODULE_AUTHOR("Dave Jones "); MODULE_DESCRIPTION("AGP GART driver"); MODULE_LICENSE("GPL and additional rights"); MODULE_ALIAS_MISCDEV(AGPGART_MINOR); diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 1108665913e..9cf6e9bb017 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -2390,5 +2390,5 @@ static void __exit agp_intel_cleanup(void) module_init(agp_intel_init); module_exit(agp_intel_cleanup); -MODULE_AUTHOR("Dave Jones "); +MODULE_AUTHOR("Dave Jones "); MODULE_LICENSE("GPL and additional rights"); diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index 5bbed3d79db..16acee2de11 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -1,7 +1,7 @@ /* * Nvidia AGPGART routines. * Based upon a 2.4 agpgart diff by the folks from NVIDIA, and hacked up - * to work in 2.5 by Dave Jones + * to work in 2.5 by Dave Jones */ #include diff --git a/drivers/char/agp/via-agp.c b/drivers/char/agp/via-agp.c index 9f4d49e1b59..d3bd243867f 100644 --- a/drivers/char/agp/via-agp.c +++ b/drivers/char/agp/via-agp.c @@ -595,4 +595,4 @@ module_init(agp_via_init); module_exit(agp_via_cleanup); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dave Jones "); +MODULE_AUTHOR("Dave Jones "); diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e30bac141b2..f88bb3e21cd 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# (c) 2001, Dave Jones. (the file handling bit) +# (c) 2001, Dave Jones. (the file handling bit) # (c) 2005, Joel Schopp (the ugly bit) # (c) 2007, Andy Whitcroft (new conditions, test suite, etc) # Licensed under the terms of the GNU GPL License version 2 -- cgit v1.2.3-70-g09d2 From e9f95e637320efe1936b647308ddf4ec5b8e0311 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 21 Oct 2008 15:49:59 +0200 Subject: genirq: fix off by one and coding style Fix off-by-one in for_each_irq_desc_reverse(). Impact is near zero in practice, because nothing substantial wants to iterate down to IRQ#0 - but fix it nevertheless. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 4 ++-- include/linux/irqnr.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index ccf6c503fc3..d1d4dc52f64 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -36,7 +36,7 @@ void ack_bad_irq(unsigned int irq) } #ifdef CONFIG_X86_32 -# define irq_stats(x) (&per_cpu(irq_stat,x)) +# define irq_stats(x) (&per_cpu(irq_stat, x)) #else # define irq_stats(x) cpu_pda(x) #endif @@ -113,7 +113,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); + seq_printf(p, "CPU%-8d", j); seq_putc(p, '\n'); } diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 3171ddc3b39..452c280c811 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -13,9 +13,9 @@ extern int nr_irqs; # define for_each_irq_desc(irq, desc) \ for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) -# define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 ); \ - irq > 0; irq--, desc--) +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ + irq >= 0; irq--, desc--) #endif #define for_each_irq_nr(irq) \ -- cgit v1.2.3-70-g09d2 From c7d87d79d14cecab7a34dedf1b133377cf5a0203 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 16 Oct 2008 16:34:43 -0400 Subject: x86 allow modules to register idle notifiers needed if the i7300_idle driver is to be modular. Signed-off-by: Venkatesh Pallipadi Acked-by: Ingo Molnar Signed-off-by: Len Brown --- arch/x86/kernel/process_64.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e12e0e4dd25..3e3d503eadc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -62,6 +62,13 @@ void idle_notifier_register(struct notifier_block *n) { atomic_notifier_chain_register(&idle_notifier, n); } +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); void enter_idle(void) { -- cgit v1.2.3-70-g09d2 From 27471fdb32e77ecb92f09d4ac5757785b4dc33bc Mon Sep 17 00:00:00 2001 From: Andy Henroid Date: Thu, 9 Oct 2008 11:45:22 -0700 Subject: i7300_idle driver v1.55 The Intel 7300 Memory Controller supports dynamic throttling of memory which can be used to save power when system is idle. This driver does the memory throttling when all CPUs are idle on such a system. Refer to "Intel 7300 Memory Controller Hub (MCH)" datasheet for the config space description. Signed-off-by: Andy Henroid Signed-off-by: Len Brown Signed-off-by: Venkatesh Pallipadi --- MAINTAINERS | 6 + arch/x86/Kconfig | 2 + drivers/Makefile | 1 + drivers/dma/ioat_dma.c | 3 + drivers/idle/Kconfig | 16 ++ drivers/idle/Makefile | 2 + drivers/idle/i7300_idle.c | 674 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/idle.h | 1 + include/linux/pci_ids.h | 1 + 9 files changed, 706 insertions(+) create mode 100644 drivers/idle/Kconfig create mode 100644 drivers/idle/Makefile create mode 100644 drivers/idle/i7300_idle.c (limited to 'arch/x86') diff --git a/MAINTAINERS b/MAINTAINERS index 8dae4555f10..43f71b0d2a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2078,6 +2078,12 @@ L: linux-ide@vger.kernel.org L: linux-scsi@vger.kernel.org S: Orphan +IDLE-I7300 +P: Andy Henroid +M: andrew.d.henroid@intel.com +L: linux-pm@lists.linux-foundation.org +S: Supported + IEEE 1394 SUBSYSTEM (drivers/ieee1394) P: Ben Collins M: ben.collins@ubuntu.com diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d132..19cdfe1f237 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1536,6 +1536,8 @@ source "arch/x86/kernel/cpu/cpufreq/Kconfig" source "drivers/cpuidle/Kconfig" +source "drivers/idle/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 2735bde7347..f443a8a9d46 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_EISA) += eisa/ obj-y += lguest/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_IDLE) += cpuidle/ +obj-y += idle/ obj-$(CONFIG_MMC) += mmc/ obj-$(CONFIG_MEMSTICK) += memstick/ obj-$(CONFIG_NEW_LEDS) += leds/ diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index bc8c6e3470c..f8396cafa05 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -171,6 +171,9 @@ static int ioat_dma_enumerate_channels(struct ioatdma_device *device) xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET); xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale)); +#if CONFIG_I7300_IDLE_IOAT_CHANNEL + device->common.chancnt--; +#endif for (i = 0; i < device->common.chancnt; i++) { ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL); if (!ioat_chan) { diff --git a/drivers/idle/Kconfig b/drivers/idle/Kconfig new file mode 100644 index 00000000000..f5b26dd579e --- /dev/null +++ b/drivers/idle/Kconfig @@ -0,0 +1,16 @@ + +menu "Memory power savings" + +config I7300_IDLE_IOAT_CHANNEL + bool + +config I7300_IDLE + tristate "Intel chipset idle power saving driver" + select I7300_IDLE_IOAT_CHANNEL + depends on X86_64 + help + Enable idle power savings with certain Intel server chipsets. + The chipset must have I/O AT support, such as the Intel 7300. + The power savings depends on the type and quantity of DRAM devices. + +endmenu diff --git a/drivers/idle/Makefile b/drivers/idle/Makefile new file mode 100644 index 00000000000..5f68fc377e2 --- /dev/null +++ b/drivers/idle/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_I7300_IDLE) += i7300_idle.o + diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c new file mode 100644 index 00000000000..59d1bbc3cd3 --- /dev/null +++ b/drivers/idle/i7300_idle.c @@ -0,0 +1,674 @@ +/* + * (C) Copyright 2008 Intel Corporation + * Authors: + * Andy Henroid + * Venkatesh Pallipadi + */ + +/* + * Save DIMM power on Intel 7300-based platforms when all CPUs/cores + * are idle, using the DIMM thermal throttling capability. + * + * This driver depends on the Intel integrated DMA controller (I/O AT). + * If the driver for I/O AT (drivers/dma/ioatdma*) is also enabled, + * this driver should work cooperatively. + */ + +/* #define DEBUG */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../dma/ioatdma_hw.h" +#include "../dma/ioatdma_registers.h" + +#define I7300_IDLE_DRIVER_VERSION "1.55" +#define I7300_PRINT "i7300_idle:" + +static int debug; +module_param_named(debug, debug, uint, 0644); +MODULE_PARM_DESC(debug, "Enable debug printks in this driver"); + +#define dprintk(fmt, arg...) \ + do { if (debug) printk(KERN_INFO I7300_PRINT fmt, ##arg); } while (0) + +/* + * Value to set THRTLOW to when initiating throttling + * 0 = No throttling + * 1 = Throttle when > 4 activations per eval window (Maximum throttling) + * 2 = Throttle when > 8 activations + * 168 = Throttle when > 168 activations (Minimum throttling) + */ +#define MAX_THRTLWLIMIT 168 +static uint i7300_idle_thrtlowlm = 1; +module_param_named(thrtlwlimit, i7300_idle_thrtlowlm, uint, 0644); +MODULE_PARM_DESC(thrtlwlimit, + "Value for THRTLOWLM activation field " + "(0 = disable throttle, 1 = Max throttle, 168 = Min throttle)"); + +/* + * simple invocation and duration statistics + */ +static unsigned long total_starts; +static unsigned long total_us; + +#ifdef DEBUG +static unsigned long past_skip; +#endif + +static struct pci_dev *fbd_dev; + +static spinlock_t i7300_idle_lock; +static int i7300_idle_active; + +static u8 i7300_idle_thrtctl_saved; +static u8 i7300_idle_thrtlow_saved; +static u32 i7300_idle_mc_saved; + +static cpumask_t idle_cpumask; +static ktime_t start_ktime; +static unsigned long avg_idle_us; + +static struct dentry *debugfs_dir; + +/* Begin: I/O AT Helper routines */ + +#define IOAT_CHANBASE(ioat_ctl, chan) (ioat_ctl + 0x80 + 0x80 * chan) +/* Snoop control (disable snoops when coherency is not important) */ +#define IOAT_DESC_SADDR_SNP_CTL (1UL << 1) +#define IOAT_DESC_DADDR_SNP_CTL (1UL << 2) + +static struct pci_dev *ioat_dev; +static struct ioat_dma_descriptor *ioat_desc; /* I/O AT desc & data (1 page) */ +static unsigned long ioat_desc_phys; +static u8 *ioat_iomap; /* I/O AT memory-mapped control regs (aka CB_BAR) */ +static u8 *ioat_chanbase; + +/* Start I/O AT memory copy */ +static int i7300_idle_ioat_start(void) +{ + u32 err; + /* Clear error (due to circular descriptor pointer) */ + err = readl(ioat_chanbase + IOAT_CHANERR_OFFSET); + if (err) + writel(err, ioat_chanbase + IOAT_CHANERR_OFFSET); + + writeb(IOAT_CHANCMD_START, ioat_chanbase + IOAT1_CHANCMD_OFFSET); + return 0; +} + +/* Stop I/O AT memory copy */ +static void i7300_idle_ioat_stop(void) +{ + int i; + u8 sts; + + for (i = 0; i < 5; i++) { + writeb(IOAT_CHANCMD_RESET, + ioat_chanbase + IOAT1_CHANCMD_OFFSET); + + udelay(10); + + sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_DMA_TRANSFER_STATUS; + + if (sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) + break; + + } + + if (i == 5) + dprintk("failed to suspend+reset I/O AT after 5 retries\n"); + +} + +/* Test I/O AT by copying 1024 byte from 2k to 1k */ +static int __init i7300_idle_ioat_selftest(u8 *ctl, + struct ioat_dma_descriptor *desc, unsigned long desc_phys) +{ + u64 chan_sts; + + memset(desc, 0, 2048); + memset((u8 *) desc + 2048, 0xab, 1024); + + desc[0].size = 1024; + desc[0].ctl = 0; + desc[0].src_addr = desc_phys + 2048; + desc[0].dst_addr = desc_phys + 1024; + desc[0].next = 0; + + writeb(IOAT_CHANCMD_RESET, ioat_chanbase + IOAT1_CHANCMD_OFFSET); + writeb(IOAT_CHANCMD_START, ioat_chanbase + IOAT1_CHANCMD_OFFSET); + + udelay(1000); + + chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_DMA_TRANSFER_STATUS; + + if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE) { + /* Not complete, reset the channel */ + writeb(IOAT_CHANCMD_RESET, + ioat_chanbase + IOAT1_CHANCMD_OFFSET); + return -1; + } + + if (*(u32 *) ((u8 *) desc + 3068) != 0xabababab || + *(u32 *) ((u8 *) desc + 2044) != 0xabababab) { + dprintk("Data values src 0x%x, dest 0x%x, memset 0x%x\n", + *(u32 *) ((u8 *) desc + 2048), + *(u32 *) ((u8 *) desc + 1024), + *(u32 *) ((u8 *) desc + 3072)); + return -1; + } + return 0; +} + +static struct device dummy_dma_dev = { + .bus_id = "fallback device", + .coherent_dma_mask = DMA_64BIT_MASK, + .dma_mask = &dummy_dma_dev.coherent_dma_mask, +}; + +/* Setup and initialize I/O AT */ +/* This driver needs I/O AT as the throttling takes effect only when there is + * some memory activity. We use I/O AT to set up a dummy copy, while all CPUs + * go idle and memory is throttled. + */ +static int __init i7300_idle_ioat_init(void) +{ + u8 ver, chan_count, ioat_chan; + u16 chan_ctl; + + ioat_iomap = (u8 *) ioremap_nocache(pci_resource_start(ioat_dev, 0), + pci_resource_len(ioat_dev, 0)); + + if (!ioat_iomap) { + printk(KERN_ERR I7300_PRINT "failed to map I/O AT registers\n"); + goto err_ret; + } + + ver = readb(ioat_iomap + IOAT_VER_OFFSET); + if (ver != IOAT_VER_1_2) { + printk(KERN_ERR I7300_PRINT "unknown I/O AT version (%u.%u)\n", + ver >> 4, ver & 0xf); + goto err_unmap; + } + + chan_count = readb(ioat_iomap + IOAT_CHANCNT_OFFSET); + if (!chan_count) { + printk(KERN_ERR I7300_PRINT "unexpected # of I/O AT channels " + "(%u)\n", + chan_count); + goto err_unmap; + } + + ioat_chan = chan_count - 1; + ioat_chanbase = IOAT_CHANBASE(ioat_iomap, ioat_chan); + + chan_ctl = readw(ioat_chanbase + IOAT_CHANCTRL_OFFSET); + if (chan_ctl & IOAT_CHANCTRL_CHANNEL_IN_USE) { + printk(KERN_ERR I7300_PRINT "channel %d in use\n", ioat_chan); + goto err_unmap; + } + + writew(IOAT_CHANCTRL_CHANNEL_IN_USE, + ioat_chanbase + IOAT_CHANCTRL_OFFSET); + + ioat_desc = (struct ioat_dma_descriptor *)dma_alloc_coherent( + &dummy_dma_dev, 4096, + (dma_addr_t *)&ioat_desc_phys, GFP_KERNEL); + if (!ioat_desc) { + printk(KERN_ERR I7300_PRINT "failed to allocate I/O AT desc\n"); + goto err_mark_unused; + } + + writel(ioat_desc_phys & 0xffffffffUL, + ioat_chanbase + IOAT1_CHAINADDR_OFFSET_LOW); + writel(ioat_desc_phys >> 32, + ioat_chanbase + IOAT1_CHAINADDR_OFFSET_HIGH); + + if (i7300_idle_ioat_selftest(ioat_iomap, ioat_desc, ioat_desc_phys)) { + printk(KERN_ERR I7300_PRINT "I/O AT self-test failed\n"); + goto err_free; + } + + /* Setup circular I/O AT descriptor chain */ + ioat_desc[0].ctl = IOAT_DESC_SADDR_SNP_CTL | IOAT_DESC_DADDR_SNP_CTL; + ioat_desc[0].src_addr = ioat_desc_phys + 2048; + ioat_desc[0].dst_addr = ioat_desc_phys + 3072; + ioat_desc[0].size = 128; + ioat_desc[0].next = ioat_desc_phys + sizeof(struct ioat_dma_descriptor); + + ioat_desc[1].ctl = ioat_desc[0].ctl; + ioat_desc[1].src_addr = ioat_desc[0].src_addr; + ioat_desc[1].dst_addr = ioat_desc[0].dst_addr; + ioat_desc[1].size = ioat_desc[0].size; + ioat_desc[1].next = ioat_desc_phys; + + return 0; + +err_free: + dma_free_coherent(&dummy_dma_dev, 4096, (void *)ioat_desc, 0); +err_mark_unused: + writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET); +err_unmap: + iounmap(ioat_iomap); +err_ret: + return -ENODEV; +} + +/* Cleanup I/O AT */ +static void __exit i7300_idle_ioat_exit(void) +{ + int i; + u64 chan_sts; + + i7300_idle_ioat_stop(); + + /* Wait for a while for the channel to halt before releasing */ + for (i = 0; i < 10; i++) { + writeb(IOAT_CHANCMD_RESET, + ioat_chanbase + IOAT1_CHANCMD_OFFSET); + + chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_DMA_TRANSFER_STATUS; + + if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) { + writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET); + break; + } + udelay(1000); + } + + chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_DMA_TRANSFER_STATUS; + + /* + * We tried to reset multiple times. If IO A/T channel is still active + * flag an error and return without cleanup. Memory leak is better + * than random corruption in that extreme error situation. + */ + if (chan_sts == IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) { + printk(KERN_ERR I7300_PRINT "Unable to stop IO A/T channels." + " Not freeing resources\n"); + return; + } + + dma_free_coherent(&dummy_dma_dev, 4096, (void *)ioat_desc, 0); + iounmap(ioat_iomap); +} + +/* End: I/O AT Helper routines */ + +#define DIMM_THRTLOW 0x64 +#define DIMM_THRTCTL 0x67 +#define DIMM_THRTCTL_THRMHUNT (1UL << 0) +#define DIMM_MC 0x40 +#define DIMM_GTW_MODE (1UL << 17) +#define DIMM_GBLACT 0x60 + +/* + * Keep track of an exponential-decaying average of recent idle durations. + * The latest duration gets DURATION_WEIGHT_PCT percentage weight + * in this average, with the old average getting the remaining weight. + * + * High weights emphasize recent history, low weights include long history. + */ +#define DURATION_WEIGHT_PCT 55 + +/* + * When the decaying average of recent durations or the predicted duration + * of the next timer interrupt is shorter than duration_threshold, the + * driver will decline to throttle. + */ +#define DURATION_THRESHOLD_US 100 + + +/* Store DIMM thermal throttle configuration */ +static int i7300_idle_thrt_save(void) +{ + u32 new_mc_val; + u8 gblactlm; + + pci_read_config_byte(fbd_dev, DIMM_THRTCTL, &i7300_idle_thrtctl_saved); + pci_read_config_byte(fbd_dev, DIMM_THRTLOW, &i7300_idle_thrtlow_saved); + pci_read_config_dword(fbd_dev, DIMM_MC, &i7300_idle_mc_saved); + /* + * Make sure we have Global Throttling Window Mode set to have a + * "short" window. This (mostly) works around an issue where + * throttling persists until the end of the global throttling window + * size. On the tested system, this was resulting in a maximum of + * 64 ms to exit throttling (average 32 ms). The actual numbers + * depends on system frequencies. Setting the short window reduces + * this by a factor of 4096. + * + * We will only do this only if the system is set for + * unlimited-activations while in open-loop throttling (i.e., when + * Global Activation Throttle Limit is zero). + */ + pci_read_config_byte(fbd_dev, DIMM_GBLACT, &gblactlm); + dprintk("thrtctl_saved = 0x%02x, thrtlow_saved = 0x%02x\n", + i7300_idle_thrtctl_saved, + i7300_idle_thrtlow_saved); + dprintk("mc_saved = 0x%08x, gblactlm = 0x%02x\n", + i7300_idle_mc_saved, + gblactlm); + if (gblactlm == 0) { + new_mc_val = i7300_idle_mc_saved | DIMM_GTW_MODE; + pci_write_config_dword(fbd_dev, DIMM_MC, new_mc_val); + return 0; + } else { + dprintk("could not set GTW_MODE = 1 (OLTT enabled)\n"); + return -ENODEV; + } +} + +/* Restore DIMM thermal throttle configuration */ +static void i7300_idle_thrt_restore(void) +{ + pci_write_config_dword(fbd_dev, DIMM_MC, i7300_idle_mc_saved); + pci_write_config_byte(fbd_dev, DIMM_THRTLOW, i7300_idle_thrtlow_saved); + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, i7300_idle_thrtctl_saved); +} + +/* Enable DIMM thermal throttling */ +static void i7300_idle_start(void) +{ + u8 new_ctl; + u8 limit; + + new_ctl = i7300_idle_thrtctl_saved & ~DIMM_THRTCTL_THRMHUNT; + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl); + + limit = i7300_idle_thrtlowlm; + if (unlikely(limit > MAX_THRTLWLIMIT)) + limit = MAX_THRTLWLIMIT; + + pci_write_config_byte(fbd_dev, DIMM_THRTLOW, limit); + + new_ctl = i7300_idle_thrtctl_saved | DIMM_THRTCTL_THRMHUNT; + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl); +} + +/* Disable DIMM thermal throttling */ +static void i7300_idle_stop(void) +{ + u8 new_ctl; + u8 got_ctl; + + new_ctl = i7300_idle_thrtctl_saved & ~DIMM_THRTCTL_THRMHUNT; + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl); + + pci_write_config_byte(fbd_dev, DIMM_THRTLOW, i7300_idle_thrtlow_saved); + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, i7300_idle_thrtctl_saved); + pci_read_config_byte(fbd_dev, DIMM_THRTCTL, &got_ctl); + WARN_ON_ONCE(got_ctl != i7300_idle_thrtctl_saved); +} + + +/* + * i7300_avg_duration_check() + * return 0 if the decaying average of recent idle durations is + * more than DURATION_THRESHOLD_US + */ +static int i7300_avg_duration_check(void) +{ + if (avg_idle_us >= DURATION_THRESHOLD_US) + return 0; + +#ifdef DEBUG + past_skip++; +#endif + return 1; +} + +/* Idle notifier to look at idle CPUs */ +static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + unsigned long flags; + ktime_t now_ktime; + static ktime_t idle_begin_time; + static int time_init = 1; + + if (!i7300_idle_thrtlowlm) + return 0; + + if (unlikely(time_init)) { + time_init = 0; + idle_begin_time = ktime_get(); + } + + spin_lock_irqsave(&i7300_idle_lock, flags); + if (val == IDLE_START) { + + cpu_set(smp_processor_id(), idle_cpumask); + + if (cpus_weight(idle_cpumask) != num_online_cpus()) + goto end; + + now_ktime = ktime_get(); + idle_begin_time = now_ktime; + + if (i7300_avg_duration_check()) + goto end; + + i7300_idle_active = 1; + total_starts++; + start_ktime = now_ktime; + + i7300_idle_start(); + i7300_idle_ioat_start(); + + } else if (val == IDLE_END) { + cpu_clear(smp_processor_id(), idle_cpumask); + if (cpus_weight(idle_cpumask) == (num_online_cpus() - 1)) { + /* First CPU coming out of idle */ + u64 idle_duration_us; + + now_ktime = ktime_get(); + + idle_duration_us = ktime_to_us(ktime_sub + (now_ktime, idle_begin_time)); + + avg_idle_us = + ((100 - DURATION_WEIGHT_PCT) * avg_idle_us + + DURATION_WEIGHT_PCT * idle_duration_us) / 100; + + if (i7300_idle_active) { + ktime_t idle_ktime; + + idle_ktime = ktime_sub(now_ktime, start_ktime); + total_us += ktime_to_us(idle_ktime); + + i7300_idle_ioat_stop(); + i7300_idle_stop(); + i7300_idle_active = 0; + } + } + } +end: + spin_unlock_irqrestore(&i7300_idle_lock, flags); + return 0; +} + +static struct notifier_block i7300_idle_nb = { + .notifier_call = i7300_idle_notifier, +}; + +/* + * I/O AT controls (PCI bus 0 device 8 function 0) + * DIMM controls (PCI bus 0 device 16 function 1) + */ +#define IOAT_BUS 0 +#define IOAT_DEVFN PCI_DEVFN(8, 0) +#define MEMCTL_BUS 0 +#define MEMCTL_DEVFN PCI_DEVFN(16, 1) + +struct fbd_ioat { + unsigned int vendor; + unsigned int ioat_dev; +}; + +/* + * The i5000 chip-set has the same hooks as the i7300 + * but support is disabled by default because this driver + * has not been validated on that platform. + */ +#define SUPPORT_I5000 0 + +static const struct fbd_ioat fbd_ioat_list[] = { + {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB}, +#if SUPPORT_I5000 + {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT}, +#endif + {0, 0} +}; + +/* table of devices that work with this driver */ +static const struct pci_device_id pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_FBD_CNB) }, +#if SUPPORT_I5000 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5000_ERR) }, +#endif + { } /* Terminating entry */ +}; + +MODULE_DEVICE_TABLE(pci, pci_tbl); + +/* Check for known platforms with I/O-AT */ +static int __init i7300_idle_platform_probe(void) +{ + int i; + + fbd_dev = pci_get_bus_and_slot(MEMCTL_BUS, MEMCTL_DEVFN); + if (!fbd_dev) + return -ENODEV; + + for (i = 0; pci_tbl[i].vendor != 0; i++) { + if (fbd_dev->vendor == pci_tbl[i].vendor && + fbd_dev->device == pci_tbl[i].device) { + break; + } + } + if (pci_tbl[i].vendor == 0) + return -ENODEV; + + ioat_dev = pci_get_bus_and_slot(IOAT_BUS, IOAT_DEVFN); + if (!ioat_dev) + return -ENODEV; + + for (i = 0; fbd_ioat_list[i].vendor != 0; i++) { + if (ioat_dev->vendor == fbd_ioat_list[i].vendor && + ioat_dev->device == fbd_ioat_list[i].ioat_dev) { + return 0; + } + } + return -ENODEV; +} + +int stats_open_generic(struct inode *inode, struct file *fp) +{ + fp->private_data = inode->i_private; + return 0; +} + +static ssize_t stats_read_ul(struct file *fp, char __user *ubuf, size_t count, + loff_t *off) +{ + unsigned long *p = fp->private_data; + char buf[32]; + int len; + + len = snprintf(buf, 32, "%lu\n", *p); + return simple_read_from_buffer(ubuf, count, off, buf, len); +} + +static const struct file_operations idle_fops = { + .open = stats_open_generic, + .read = stats_read_ul, +}; + +struct debugfs_file_info { + void *ptr; + char name[32]; + struct dentry *file; +} debugfs_file_list[] = { + {&total_starts, "total_starts", NULL}, + {&total_us, "total_us", NULL}, +#ifdef DEBUG + {&past_skip, "past_skip", NULL}, +#endif + {NULL, "", NULL} + }; + +static int __init i7300_idle_init(void) +{ + spin_lock_init(&i7300_idle_lock); + cpus_clear(idle_cpumask); + total_us = 0; + + if (i7300_idle_platform_probe()) + return -ENODEV; + + if (i7300_idle_thrt_save()) + return -ENODEV; + + if (i7300_idle_ioat_init()) + return -ENODEV; + + debugfs_dir = debugfs_create_dir("i7300_idle", NULL); + if (debugfs_dir) { + int i = 0; + + while (debugfs_file_list[i].ptr != NULL) { + debugfs_file_list[i].file = debugfs_create_file( + debugfs_file_list[i].name, + S_IRUSR, + debugfs_dir, + debugfs_file_list[i].ptr, + &idle_fops); + i++; + } + } + + idle_notifier_register(&i7300_idle_nb); + + printk(KERN_INFO "i7300_idle: loaded v%s\n", I7300_IDLE_DRIVER_VERSION); + return 0; +} + +static void __exit i7300_idle_exit(void) +{ + idle_notifier_unregister(&i7300_idle_nb); + + if (debugfs_dir) { + int i = 0; + + while (debugfs_file_list[i].file != NULL) { + debugfs_remove(debugfs_file_list[i].file); + i++; + } + + debugfs_remove(debugfs_dir); + } + i7300_idle_thrt_restore(); + i7300_idle_ioat_exit(); +} + +module_init(i7300_idle_init); +module_exit(i7300_idle_exit); + +MODULE_AUTHOR("Andy Henroid "); +MODULE_DESCRIPTION("Intel Chipset DIMM Idle Power Saving Driver v" + I7300_IDLE_DRIVER_VERSION); +MODULE_LICENSE("GPL"); diff --git a/include/asm-x86/idle.h b/include/asm-x86/idle.h index cbb64912361..54ce018d4b6 100644 --- a/include/asm-x86/idle.h +++ b/include/asm-x86/idle.h @@ -6,6 +6,7 @@ struct notifier_block; void idle_notifier_register(struct notifier_block *n); +void idle_notifier_unregister(struct notifier_block *n); void enter_idle(void); void exit_idle(void); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index f1624b39675..efb786d11f2 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2422,6 +2422,7 @@ #define PCI_DEVICE_ID_INTEL_MCH_PC1 0x359a #define PCI_DEVICE_ID_INTEL_E7525_MCH 0x359e #define PCI_DEVICE_ID_INTEL_IOAT_CNB 0x360b +#define PCI_DEVICE_ID_INTEL_FBD_CNB 0x360c #define PCI_DEVICE_ID_INTEL_ICH10_0 0x3a14 #define PCI_DEVICE_ID_INTEL_ICH10_1 0x3a16 #define PCI_DEVICE_ID_INTEL_ICH10_2 0x3a18 -- cgit v1.2.3-70-g09d2 From 8bcad30f2e6d4c20f7e71d2e2ac77acc0f0931e5 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Tue, 21 Oct 2008 19:49:09 -0400 Subject: x86: make variables static These variables are only used in their source files, so make them static. Signed-off-by: Roel Kluin Signed-off-by: Ingo Molnar --- arch/x86/boot/video-bios.c | 4 ++-- arch/x86/boot/video-vesa.c | 4 ++-- arch/x86/kernel/xsave.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index 49f26aaaebc..3fa979c9c36 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -17,7 +17,7 @@ #include "boot.h" #include "video.h" -__videocard video_bios; +static __videocard video_bios; /* Set a conventional BIOS mode */ static int set_bios_mode(u8 mode); @@ -119,7 +119,7 @@ static int bios_probe(void) return nmodes; } -__videocard video_bios = +static __videocard video_bios = { .card_name = "BIOS", .probe = bios_probe, diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 99b3079dc6a..75115849af3 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -20,7 +20,7 @@ static struct vesa_general_info vginfo; static struct vesa_mode_info vminfo; -__videocard video_vesa; +static __videocard video_vesa; #ifndef _WAKEUP static void vesa_store_mode_params_graphics(void); @@ -293,7 +293,7 @@ void vesa_store_edid(void) #endif /* not _WAKEUP */ -__videocard video_vesa = +static __videocard video_vesa = { .card_name = "VESA", .probe = vesa_probe, diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 9abac8a9d82..b13acb75e82 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -248,7 +248,7 @@ clear: * This will be saved when ever the FP and extended state context is * saved on the user stack during the signal handler delivery to the user. */ -void prepare_fx_sw_frame(void) +static void prepare_fx_sw_frame(void) { int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + FP_XSTATE_MAGIC2_SIZE; -- cgit v1.2.3-70-g09d2 From b0f209898f1a177bd503d49215b8c6628797a81c Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Tue, 21 Oct 2008 14:09:51 -0500 Subject: x86, uv: use consistent names for region size and conherence id on x86 and ia64 Use consistent names for region size and conherence id on x86 and ia64. The SGI xp drivers are used on both ia64 and x86. Using the same names (sn_coherency_id, sn_region_size) simplies the driver code. Signed-off-by: Russ Anderson Signed-off-by: Ingo Molnar --- arch/x86/kernel/bios_uv.c | 8 ++++---- arch/x86/kernel/genx2apic_uv_x.c | 4 ++-- include/asm-x86/uv/bios.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index f0dfe6f17e7..7cefb7170e7 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); -long uv_coherency_id; -EXPORT_SYMBOL_GPL(uv_coherency_id); -long uv_region_size; -EXPORT_SYMBOL_GPL(uv_region_size); +long sn_coherency_id; +EXPORT_SYMBOL_GPL(sn_coherency_id); +long sn_region_size; +EXPORT_SYMBOL_GPL(sn_region_size); int uv_type; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index bfd532843df..6cf35c8bd63 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -429,7 +429,7 @@ void __init uv_system_init(void) uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &uv_coherency_id, &uv_region_size); + &sn_coherency_id, &sn_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@ -451,7 +451,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; + uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); diff --git a/include/asm-x86/uv/bios.h b/include/asm-x86/uv/bios.h index 215f1969c26..7b3d7022c63 100644 --- a/include/asm-x86/uv/bios.h +++ b/include/asm-x86/uv/bios.h @@ -85,9 +85,9 @@ extern void uv_bios_init(void); extern int uv_type; extern long sn_partition_id; -extern long uv_coherency_id; -extern long uv_region_size; -#define partition_coherence_id() (uv_coherency_id) +extern long sn_coherency_id; +extern long sn_region_size; +#define partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ -- cgit v1.2.3-70-g09d2 From 2bfef69d9e8cc056aa4dbc13f2136747340b4515 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 15 Oct 2008 11:51:53 +0200 Subject: x86: SB600: skip IRQ0 override if it is not routed to INT2 of IOAPIC Impact: fix hung bootup and other misbehavior on certain laptops On some more HP laptops BIOS reports an IRQ0 override but the SB600 chipset is configured such that timer interrupts go to INT0 of IOAPIC. Check IRQ0 routing and if it is routed to INT0 of IOAPIC skip the timer override. See following bug reports: http://bugzilla.kernel.org/show_bug.cgi?id=11715 http://bugzilla.kernel.org/show_bug.cgi?id=11516 Signed-off-by: Andreas Herrmann Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 55 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 733c4f8d42e..3ce029ffaa5 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -95,7 +95,8 @@ static void __init nvidia_bugs(int num, int slot, int func) } -static u32 ati_ixp4x0_rev(int num, int slot, int func) +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; u8 b; @@ -115,7 +116,6 @@ static u32 ati_ixp4x0_rev(int num, int slot, int func) static void __init ati_bugs(int num, int slot, int func) { -#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC) u32 d; u8 b; @@ -138,9 +138,56 @@ static void __init ati_bugs(int num, int slot, int func) printk(KERN_INFO "If you got timer trouble " "try acpi_use_timer_override\n"); } -#endif } +static u32 __init ati_sbx00_rev(int num, int slot, int func) +{ + u32 old, d; + + d = read_pci_config(num, slot, func, 0x70); + old = d; + d &= ~(1<<8); + write_pci_config(num, slot, func, 0x70, d); + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + write_pci_config(num, slot, func, 0x70, old); + + return d; +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ + u32 d, rev; + + if (acpi_use_timer_override) + return; + + rev = ati_sbx00_rev(num, slot, func); + if (rev > 0x13) + return; + + /* check for IRQ0 interrupt swap */ + d = read_pci_config(num, slot, func, 0x64); + if (!(d & (1<<14))) + acpi_skip_timer_override = 1; + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB600 revision 0x%x\n", rev); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} +#else +static void __init ati_bugs(int num, int slot, int func) +{ +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ +} +#endif + #ifdef CONFIG_DMAR static void __init intel_g33_dmar(int num, int slot, int func) { @@ -176,6 +223,8 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, #ifdef CONFIG_DMAR { PCI_VENDOR_ID_INTEL, 0x29c0, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, -- cgit v1.2.3-70-g09d2 From d2f6f7aeee890df445be29a60e34925ec15f620c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 21 Oct 2008 22:45:22 +0200 Subject: MCE: Don't run 32bit machine checks with interrupts on Running machine checks with interrupt on is a extremly bad idea. The machine check handler only runs when the system is broken and needs to finish as quickly as possible. Remove the respective bogus post 2.6.27 regression and call the machine check vector directly again. This removes only code. Signed-off-by: Andi Kleen [Cherry-picked from x86/mce] Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/traps.c | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c356423a602..dd65143941a 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1024,7 +1024,7 @@ ENTRY(machine_check) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl $do_machine_check + pushl machine_check_vector CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e062974cce3..04d242ab016 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -931,14 +931,6 @@ do_device_not_available(struct pt_regs *regs, long error) } #ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_MCE -dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) -{ - conditional_sti(regs); - machine_check_vector(regs, error); -} -#endif - dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; -- cgit v1.2.3-70-g09d2 From 9e899816d126cc6f7d405c349f65363214fe7399 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Oct 2008 12:33:16 +0200 Subject: x86, mm: enable GBPAGES option by default DIRECT_GBPAGES was under DEBUG_KERNEL && EXPERIMENTAL and disabled by default. Turn it on by default and put it under EMBEDDED. Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 9 +++++++++ arch/x86/Kconfig.debug | 12 ------------ 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5b9b12321ad..c00aefcb47d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -946,6 +946,15 @@ config X86_PAE config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config DIRECT_GBPAGES + bool "Enable 1GB pages for kernel pagetables" if EMBEDDED + default y + depends on X86_64 + help + Allow the kernel linear mapping to use 1GB pages on CPUs that + support it. This can improve the kernel's performance a tiny bit by + reducing TLB pressure. If in doubt, say "Y". + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e67..567fe543e09 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -114,18 +114,6 @@ config DEBUG_RODATA data. This is recommended so that we can catch kernel bugs sooner. If in doubt, say "Y". -config DIRECT_GBPAGES - bool "Enable gbpages-mapped kernel pagetables" - depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64 - help - Enable gigabyte pages support (if the CPU supports it). This can - improve the kernel's performance a tiny bit by reducing TLB - pressure. - - This is experimental code. - - If in doubt, say "N". - config DEBUG_RODATA_TEST bool "Testcase for the DEBUG_RODATA feature" depends on DEBUG_RODATA -- cgit v1.2.3-70-g09d2 From cf52ebedba77ee494b495dedd3a1f55944611275 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 17 Oct 2008 17:00:13 -0400 Subject: x86, kexec: fix hang on i386 when panic occurs while console_sem is held There's a corner case in 32 bit x86 kdump at the moment. When the box panics via nmi, we call bust_spinlocks(1) to disable sensitivity to the console_sem (allowing us to print to the console in all cases), but we don't call crash_kexec, until after we call bust_spinlocks(0), which re-enables console_sem sensitivity. The result is that, if we get an nmi while the console_sem is held and kdump is configured, and we try to print something to the console during kdump shutdown (which we often do) we deadlock the box. The fix is to simply do what 64 bit die_nmi does which is to not call bust_spinlocks(0) until after we call crash_kexec. Patch below tested successfully by me. Signed-off-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 1a78180f08d..b3614752197 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -405,7 +405,6 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) panic("Non maskable interrupt"); console_silent(); spin_unlock(&nmi_print_lock); - bust_spinlocks(0); /* * If we are in kernel we are probably nested up pretty bad @@ -416,6 +415,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) crash_kexec(regs); } + bust_spinlocks(0); do_exit(SIGSEGV); } -- cgit v1.2.3-70-g09d2 From b4b8f87bf4958cbad620654efc0882ac46c19846 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:08 +0200 Subject: i386, dumpstack: move crash_kexec before bust_spinlocks(0) in oops_end crash_kexec should not be called with console_sem held. Move the call before bust_spinlocks(0) in oops_end to avoid the problem. Signed-off-by: Alexander van Heukelum Acked-by: "Neil Horman" Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b3614752197..5493d31be4e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -309,6 +309,9 @@ unsigned __kprobes long oops_begin(void) void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + bust_spinlocks(0); die_owner = -1; add_taint(TAINT_DIE); @@ -318,8 +321,6 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (!regs) return; - if (kexec_should_crash(current)) - crash_kexec(regs); if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) -- cgit v1.2.3-70-g09d2 From 874d93d11823b2b861addac6a5dc31162e924ab2 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:09 +0200 Subject: x86, dumpstack: let signr=0 signal no do_exit Change oops_end such that signr=0 signals that do_exit is not to be called. Currently, each use of __die is soon followed by a call to oops_end and 'regs' is set to NULL if oops_end is expected not to call do_exit. Change all such pairs to set signr=0 instead. On x86_64 oops_end is used 'bare' in die_nmi; use signr=0 instead of regs=NULL there, too. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 7 ++++--- arch/x86/kernel/dumpstack_64.c | 9 +++++---- arch/x86/mm/fault.c | 11 +++++++---- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5493d31be4e..7c22f99f0ef 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -318,7 +318,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - if (!regs) + if (!signr) return; if (in_interrupt()) @@ -371,17 +371,18 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); + int sig = SIGSEGV; if (die_nest_count < 3) { report_bug(regs->ip, regs); if (__die(str, regs, err)) - regs = NULL; + sig = 0; } else { printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); } - oops_end(flags, regs, SIGSEGV); + oops_end(flags, regs, sig); } static DEFINE_SPINLOCK(nmi_print_lock); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 96a5db7da8a..ffefea611ba 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -465,7 +465,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - if (!regs) { + if (!signr) { oops_exit(); return; } @@ -509,13 +509,14 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); + int sig = SIGSEGV; if (!user_mode(regs)) report_bug(regs->ip, regs); if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); + sig = 0; + oops_end(flags, regs, sig); } notrace __kprobes void @@ -539,7 +540,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) crash_kexec(regs); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); + oops_end(flags, regs, 0); nmi_exit(); local_irq_enable(); do_exit(SIGBUS); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 31e8730fa24..20ef272c412 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { unsigned long flags = oops_begin(); + int sig = SIGKILL; struct task_struct *tsk; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", @@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) - regs = NULL; - oops_end(flags, regs, SIGKILL); + sig = 0; + oops_end(flags, regs, sig); } #endif @@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) int fault; #ifdef CONFIG_X86_64 unsigned long flags; + int sig; #endif tsk = current; @@ -849,11 +851,12 @@ no_context: bust_spinlocks(0); do_exit(SIGKILL); #else + sig = SIGKILL; if (__die("Oops", regs, error_code)) - regs = NULL; + sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, SIGKILL); + oops_end(flags, regs, sig); #endif /* -- cgit v1.2.3-70-g09d2 From 0ed7a498f416dcfa1cca478a559238a2a3396240 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:10 +0200 Subject: x86_64, dumpstack: move kexec_crash from __die to oops_end oops_end is preceded by either a call to __die, or a conditional call to crash_kexec. Move the conditional call to crash_kexec from the end of __die to the start of oops_end and remove the superfluous call to crash_kexec in die_nmi. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index ffefea611ba..57ce11b895c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -458,6 +458,9 @@ unsigned __kprobes long oops_begin(void) void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + die_owner = -1; bust_spinlocks(0); die_nest_count--; @@ -501,8 +504,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk(KERN_ALERT "RIP "); printk_address(regs->ip, 1); printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); return 0; } @@ -536,11 +537,9 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); + oops_end(flags, regs, 0); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags, regs, 0); nmi_exit(); local_irq_enable(); do_exit(SIGBUS); -- cgit v1.2.3-70-g09d2 From 10b14cb7eb7dd5bff8023f76a55c8ac20e586128 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:11 +0200 Subject: x86, dumpstack: always call oops_exit from oops_end Always call oops_exit from oops_end, even if signr==0. Also, move add_taint(TAINT_DIE) from __die to oops_end on x86_64 and interchange two lines to make oops_end more similar to the i386-version. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 2 +- arch/x86/kernel/dumpstack_64.c | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 7c22f99f0ef..a29b88ffa34 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -318,6 +318,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); + oops_exit(); if (!signr) return; @@ -325,7 +326,6 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - oops_exit(); do_exit(signr); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 57ce11b895c..dc6162bf745 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -461,22 +461,22 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (regs && kexec_should_crash(current)) crash_kexec(regs); - die_owner = -1; bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - if (!signr) { - oops_exit(); + oops_exit(); + + if (!signr) return; - } if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - oops_exit(); do_exit(signr); } @@ -499,7 +499,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) return 1; show_registers(regs); - add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); printk_address(regs->ip, 1); -- cgit v1.2.3-70-g09d2 From e4955cfd2f5c81eb708f55769aa60173f207fd63 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:12 +0200 Subject: i386, dumpstack: use x86_64's method to account die_nest_count oops_begin/oops_end should always be used in pairs. On x86_64 oops_begin increments die_nest_count, and oops_end decrements die_nest_count. Doing this makes oops_begin and oops_end equal to the x86_64 versions. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index a29b88ffa34..7c7d691b32b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -289,21 +289,24 @@ static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { + int cpu; unsigned long flags; oops_enter(); - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); } die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); return flags; } @@ -315,13 +318,15 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) bust_spinlocks(0); die_owner = -1; add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); - oops_exit(); + if (!signr) return; - if (in_interrupt()) panic("Fatal exception in interrupt"); if (panic_on_oops) -- cgit v1.2.3-70-g09d2 From e06ca430c3d0fddbd1c901ab3fb3e1f0bc8a786b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:13 +0200 Subject: i386, dumpstack: use oops_begin/oops_end in die_nmi Use oops_begin and oops_end in die_nmi. Whitespace-only changes on x86_64, to make it equal to i386's version. Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 33 +++++++++++---------------------- arch/x86/kernel/dumpstack_64.c | 4 ++-- 2 files changed, 13 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 7c7d691b32b..e91ae34f968 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -390,40 +390,29 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static DEFINE_SPINLOCK(nmi_print_lock); - void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { + unsigned long flags; + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; - spin_lock(&nmi_print_lock); /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); + * We are in trouble anyway, lets at least try + * to get a message out. + */ + flags = oops_begin(); printk(KERN_EMERG "%s", str); printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); - if (do_panic) + oops_end(flags, regs, 0); + if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - bust_spinlocks(0); - do_exit(SIGSEGV); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); } static int __init oops_setup(char *s) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index dc6162bf745..831e1e159cb 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -519,7 +519,7 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -notrace __kprobes void +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags; @@ -527,11 +527,11 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; - flags = oops_begin(); /* * We are in trouble anyway, lets at least try * to get a message out. */ + flags = oops_begin(); printk(KERN_EMERG "%s", str); printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); -- cgit v1.2.3-70-g09d2 From 871d3779cba18b028e34d0d2f6cc6caae76a97b6 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Wed, 22 Oct 2008 12:00:14 +0200 Subject: i386, dumpstack: unify die() Make i386's die() equal to x86_64's version. Whitespace-only changes on x86_64, to make it equal to i386's version. (user_mode and user_mode_vm are equal on x86_64.) Signed-off-by: Alexander van Heukelum Acked-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 10 +++------- arch/x86/kernel/dumpstack_64.c | 6 +++++- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e91ae34f968..f2046c5752d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -378,15 +378,11 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (die_nest_count < 3) { + if (!user_mode_vm(regs)) report_bug(regs->ip, regs); - if (__die(str, regs, err)) - sig = 0; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - + if (__die(str, regs, err)) + sig = 0; oops_end(flags, regs, sig); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 831e1e159cb..28c67aae556 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -506,12 +506,16 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) return 0; } +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ void die(const char *str, struct pt_regs *regs, long err) { unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode(regs)) + if (!user_mode_vm(regs)) report_bug(regs->ip, regs); if (__die(str, regs, err)) -- cgit v1.2.3-70-g09d2 From 35af28219e684a36cc8b1ff456c370ce22be157d Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 22 Oct 2008 13:08:31 +0200 Subject: x86: call dmi-quirks for HP Laptops after early-quirks are executed Impact: make warning message disappear - functionality unchanged Problems with bogus IRQ0 override of those laptops should be fixed with commits x86: SB600: skip IRQ0 override if it is not routed to INT2 of IOAPIC x86: SB450: skip IRQ0 override if it is not routed to INT2 of IOAPIC that introduce early-quirks based on chipset configuration. For further information, see http://bugzilla.kernel.org/show_bug.cgi?id=11516 Instead of removing the related dmi-quirks completely we'd like to keep them for (at least) one kernel version -- to double-check whether the early-quirks really took effect. But the dmi-quirks need to be called after early-quirks are executed. With this patch calling sequence for dmi-quriks is changed as follows: acpi_boot_table_init() (dmi-quirks) ... early_quirks() (detect bogus IRQ0 override) ... acpi_boot_init() (late dmi-quirks and setup IO APIC) Note: Plan is to remove the "late dmi-quirks" with next kernel version. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0d1c26a583c..8072edb6247 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1598,6 +1598,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), }, }, + {} +}; + +/* second table for DMI checks that should run after early-quirks */ +static struct dmi_system_id __initdata acpi_dmi_table_late[] = { /* * HP laptops which use a DSDT reporting as HP/SB400/10000, * which includes some code which overrides all temperature @@ -1726,6 +1731,9 @@ int __init early_acpi_boot_init(void) int __init acpi_boot_init(void) { + /* those are executed after early-quirks are executed */ + dmi_check_system(acpi_dmi_table_late); + /* * If acpi_disabled, bail out * One exception: acpi=ht continues far enough to enumerate LAPICs -- cgit v1.2.3-70-g09d2 From bc8bcc79ea4203c7d04309f1307ab88c86f0b0cf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 22 Oct 2008 12:42:30 +0800 Subject: x86/proc: fix /proc/cpuinfo cpu offline bug Impact: fix missing CPUs in /proc/cpuinfo after CPU hotunplug/hotreplug In my test, I found that if a cpu has been offline, the next cpus may not be shown in the /proc/cpuinfo. if one read() cannot consume the whole /proc/cpuinfo, c_start() will be called again in the next read() calls. And *pos has been increased by 1 by the caller(seq_read()). if this time the cpu#*pos is offline, c_start() will return NULL, and the next cpus can not be shown. this fix use next_cpu_nr(*pos - 1, cpu_online_map) to search the next unshown cpu. the most easy way to reproduce this bug: 1) offline cpu#1 (cpu#0 is online) 2) dd ibs=2 if=/proc/cpuinfo the result is that only cpu#0 is shown. cpu#2 and cpu#3 .... cannot be shown in /proc/cpuinfo it's bug. Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/proc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index a26c480b949..01b1244ef1c 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -160,14 +160,16 @@ static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ *pos = first_cpu(cpu_online_map); - if ((*pos) < nr_cpu_ids && cpu_online(*pos)) + else + *pos = next_cpu_nr(*pos - 1, cpu_online_map); + if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { - *pos = next_cpu(*pos, cpu_online_map); + (*pos)++; return c_start(m, pos); } -- cgit v1.2.3-70-g09d2 From db96b0a0e4158806fcf03945a870f9320e6bac9b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 22 Oct 2008 18:00:09 +0400 Subject: x86: do_boot_cpu - check if we have ESR register Impact: fix APIC IRQ irregularities on certain older boxes We should touch the APIC ESR register if only we have it. The patch fixes the problem mentioned by Max Kellermann: http://lkml.org/lkml/2008/10/17/147 Bisected-by: Max Kellermann Signed-off-by: Cyrill Gorcunov [ mingo@elte.hu: build fix ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7ece815ea63..7b109339731 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -893,9 +893,11 @@ do_rest: smpboot_setup_warm_reset_vector(start_ip); /* * Be paranoid about clearing APIC errors. - */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); + */ + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } } /* -- cgit v1.2.3-70-g09d2 From 55410791c91f448737c90585d2e280edd3c4d5c7 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Wed, 15 Oct 2008 10:37:04 -0300 Subject: x86: remove redundant KERN_DEBUG on pr_debug pr_debug don't need KERN_DEBUG. Signed-off-by: Gustavo F. Padovan Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/setup_percpu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8072edb6247..dc3eac353db 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1136,7 +1136,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n", + pr_debug("Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); #ifdef CONFIG_X86_32 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 410c88f0bfe..ae0c0d3bb77 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -218,7 +218,7 @@ static void __init setup_node_to_cpumask_map(void) /* allocate the map */ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", + pr_debug("Node to cpumask map at %p for %d nodes\n", map, nr_node_ids); /* node_to_cpumask() will now work */ -- cgit v1.2.3-70-g09d2 From aef8f5b8c2da706cb3efe701e2a35e11221ea5bd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 14 Oct 2008 21:43:43 -0700 Subject: x86/tlb_uv: remove strange mc146818rtc include For some reason tlb_uv was including linux/mc146818rtc.h. It really just needs linux/seq_file.h Signed-off-by: Jeremy Fitzhardinge Cc: Cliff Wickman Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 8b8c0d6640f..04431f34fd1 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -6,7 +6,7 @@ * This code is released under the GNU General Public License version 2 or * later. */ -#include +#include #include #include -- cgit v1.2.3-70-g09d2 From 2cb0ebeeb664e6eefe06951709949203e04f7c7b Mon Sep 17 00:00:00 2001 From: Daniele Calore Date: Mon, 13 Oct 2008 10:34:12 +0200 Subject: x86: memtest fix use of reserve_early() Hi all, Wrong usage of 2nd parameter in reserve_early call. 66/75: reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); ^^^^^^^^^^^^^^^^^^^^ The correct way is to use 'end' address and not 'size'. As a bonus a fix to the printk format. Signed-off-by: Daniele Calore Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 672e17f8262..9cab18b0b85 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -61,9 +61,9 @@ static void __init memtest(unsigned long start_phys, unsigned long size, last_bad += incr; } else { if (start_bad) { - printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved", + printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", val, start_bad, last_bad + incr); - reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + reserve_early(start_bad, last_bad + incr, "BAD RAM"); } start_bad = last_bad = start_phys_aligned; } @@ -72,9 +72,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size, if (start_bad) { printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", val, start_bad, last_bad + incr); - reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + reserve_early(start_bad, last_bad + incr, "BAD RAM"); } - } /* default is disabled */ -- cgit v1.2.3-70-g09d2 From 983f91ff949d9008eb7fb73ba4e35f9decc182b3 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:44:08 +0200 Subject: x86: fix section mismatch warning - apic_flat Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xbe08): Section mismatch in reference from the variable apic_flat to the function .init.text:flat_acpi_madt_oem_check() The variable apic_flat references the function __init flat_acpi_madt_oem_check() This is harmless, because the .acpi_madt_oem_check is only called during init time. But we keep the function pointer around in a .data function pointer template, so it's better we do not keep that stale - so mark this function non-__init. Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 2ec2de8d8c4..a10ed42a35e 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -25,7 +25,7 @@ #include #endif -static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 1; } -- cgit v1.2.3-70-g09d2 From fae1721601a008fc83a809fdd76085a56f2275f0 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:44:09 +0200 Subject: x86: fix section mismatch warning - apic_physflat Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xbe88): Section mismatch in reference from the variable apic_physflat to the function .init.text:physflat_acpi_madt_oem_check() The variable apic_physflat references the function __init physflat_acpi_madt_oem_check() Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index a10ed42a35e..c0262791bda 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -170,7 +170,7 @@ struct genapic apic_flat = { * We cannot use logical delivery in this case because the mask * overflows, so use physical mode. */ -static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { #ifdef CONFIG_ACPI /* -- cgit v1.2.3-70-g09d2 From f8827c017f19e1cd8834821f4303d5f163feab84 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:44:10 +0200 Subject: x86: fix section mismatch warning - apic_x2apic_uv_x Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xbf08): Section mismatch in reference from the variable apic_x2apic_uv_x to the function .init.text:uv_acpi_madt_oem_check() The variable apic_x2apic_uv_x references the function __init uv_acpi_madt_oem_check() Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index bfd532843df..680a06557c5 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -30,7 +30,7 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; -static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strcmp(oem_id, "SGI")) { if (!strcmp(oem_table_id, "UVL")) -- cgit v1.2.3-70-g09d2 From 2caa37150d7aa110b6155cb77b07e581c103fef4 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:44:11 +0200 Subject: x86: fix section mismatch warning - apic_x2apic_cluster Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xbf88): Section mismatch in reference from the variable apic_x2apic_cluster to the function .init.text:x2apic_acpi_madt_oem_check() The variable apic_x2apic_cluster references the function __init x2apic_acpi_madt_oem_check() Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index e4bf2cc0d74..f6a2c8eb48a 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -12,7 +12,7 @@ DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); -static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (cpu_has_x2apic) return 1; -- cgit v1.2.3-70-g09d2 From 3cfba0892585d4c8e7b4122b5dc0d206a76936de Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:46:23 +0200 Subject: x86: fix section mismatch warning - apic_x2apic_phys Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xc008): Section mismatch in reference from the variable apic_x2apic_phys to the function .init.text:x2apic_acpi_madt_oem_check() The variable apic_x2apic_phys references the function __init x2apic_acpi_madt_oem_check() Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_phys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index 8f1343df262..d042211768b 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -19,7 +19,7 @@ static int set_x2apic_phys_mode(char *arg) } early_param("x2apic_phys", set_x2apic_phys_mode); -static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (cpu_has_x2apic && x2apic_phys) return 1; -- cgit v1.2.3-70-g09d2 From bb8985586b7a906e116db835c64773b7a7d51663 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 17 Aug 2008 21:05:42 -0400 Subject: x86, um: ... and asm-x86 move Signed-off-by: Al Viro Signed-off-by: H. Peter Anvin --- arch/ia64/ia32/audit.c | 2 +- arch/um/Makefile | 4 +- arch/um/sys-x86_64/syscall_table.c | 4 +- arch/x86/Makefile | 6 +- arch/x86/include/asm/Kbuild | 24 + arch/x86/include/asm/a.out-core.h | 73 + arch/x86/include/asm/a.out.h | 20 + arch/x86/include/asm/acpi.h | 178 +++ arch/x86/include/asm/agp.h | 35 + arch/x86/include/asm/alternative-asm.h | 22 + arch/x86/include/asm/alternative.h | 183 +++ arch/x86/include/asm/amd_iommu.h | 35 + arch/x86/include/asm/amd_iommu_types.h | 404 +++++ arch/x86/include/asm/apic.h | 199 +++ arch/x86/include/asm/apicdef.h | 417 ++++++ arch/x86/include/asm/arch_hooks.h | 26 + arch/x86/include/asm/asm.h | 47 + arch/x86/include/asm/atomic.h | 5 + arch/x86/include/asm/atomic_32.h | 259 ++++ arch/x86/include/asm/atomic_64.h | 473 ++++++ arch/x86/include/asm/auxvec.h | 12 + arch/x86/include/asm/bigsmp/apic.h | 139 ++ arch/x86/include/asm/bigsmp/apicdef.h | 13 + arch/x86/include/asm/bigsmp/ipi.h | 25 + arch/x86/include/asm/bios_ebda.h | 36 + arch/x86/include/asm/bitops.h | 451 ++++++ arch/x86/include/asm/boot.h | 26 + arch/x86/include/asm/bootparam.h | 111 ++ arch/x86/include/asm/bug.h | 39 + arch/x86/include/asm/bugs.h | 12 + arch/x86/include/asm/byteorder.h | 81 + arch/x86/include/asm/cache.h | 20 + arch/x86/include/asm/cacheflush.h | 118 ++ arch/x86/include/asm/calgary.h | 72 + arch/x86/include/asm/calling.h | 170 +++ arch/x86/include/asm/checksum.h | 5 + arch/x86/include/asm/checksum_32.h | 189 +++ arch/x86/include/asm/checksum_64.h | 191 +++ arch/x86/include/asm/cmpxchg.h | 5 + arch/x86/include/asm/cmpxchg_32.h | 344 +++++ arch/x86/include/asm/cmpxchg_64.h | 185 +++ arch/x86/include/asm/compat.h | 218 +++ arch/x86/include/asm/cpu.h | 20 + arch/x86/include/asm/cpufeature.h | 271 ++++ arch/x86/include/asm/cputime.h | 1 + arch/x86/include/asm/current.h | 39 + arch/x86/include/asm/debugreg.h | 70 + arch/x86/include/asm/delay.h | 31 + arch/x86/include/asm/desc.h | 409 +++++ arch/x86/include/asm/desc_defs.h | 95 ++ arch/x86/include/asm/device.h | 16 + arch/x86/include/asm/div64.h | 60 + arch/x86/include/asm/dma-mapping.h | 308 ++++ arch/x86/include/asm/dma.h | 318 ++++ arch/x86/include/asm/dmi.h | 26 + arch/x86/include/asm/ds.h | 238 +++ arch/x86/include/asm/dwarf2.h | 61 + arch/x86/include/asm/e820.h | 146 ++ arch/x86/include/asm/edac.h | 18 + arch/x86/include/asm/efi.h | 110 ++ arch/x86/include/asm/elf.h | 336 +++++ arch/x86/include/asm/emergency-restart.h | 18 + arch/x86/include/asm/errno.h | 1 + arch/x86/include/asm/es7000/apic.h | 193 +++ arch/x86/include/asm/es7000/apicdef.h | 13 + arch/x86/include/asm/es7000/ipi.h | 24 + arch/x86/include/asm/es7000/mpparse.h | 30 + arch/x86/include/asm/es7000/wakecpu.h | 59 + arch/x86/include/asm/fb.h | 21 + arch/x86/include/asm/fcntl.h | 1 + arch/x86/include/asm/fixmap.h | 68 + arch/x86/include/asm/fixmap_32.h | 123 ++ arch/x86/include/asm/fixmap_64.h | 83 ++ arch/x86/include/asm/floppy.h | 281 ++++ arch/x86/include/asm/frame.h | 27 + arch/x86/include/asm/ftrace.h | 24 + arch/x86/include/asm/futex.h | 140 ++ arch/x86/include/asm/gart.h | 73 + arch/x86/include/asm/genapic.h | 5 + arch/x86/include/asm/genapic_32.h | 126 ++ arch/x86/include/asm/genapic_64.h | 58 + arch/x86/include/asm/geode.h | 253 ++++ arch/x86/include/asm/gpio.h | 56 + arch/x86/include/asm/hardirq.h | 11 + arch/x86/include/asm/hardirq_32.h | 28 + arch/x86/include/asm/hardirq_64.h | 23 + arch/x86/include/asm/highmem.h | 82 + arch/x86/include/asm/hpet.h | 114 ++ arch/x86/include/asm/hugetlb.h | 93 ++ arch/x86/include/asm/hw_irq.h | 131 ++ arch/x86/include/asm/hypertransport.h | 45 + arch/x86/include/asm/i387.h | 400 +++++ arch/x86/include/asm/i8253.h | 18 + arch/x86/include/asm/i8259.h | 63 + arch/x86/include/asm/ia32.h | 170 +++ arch/x86/include/asm/ia32_unistd.h | 18 + arch/x86/include/asm/idle.h | 15 + arch/x86/include/asm/intel_arch_perfmon.h | 31 + arch/x86/include/asm/io.h | 91 ++ arch/x86/include/asm/io_32.h | 284 ++++ arch/x86/include/asm/io_64.h | 244 +++ arch/x86/include/asm/io_apic.h | 204 +++ arch/x86/include/asm/ioctl.h | 1 + arch/x86/include/asm/ioctls.h | 94 ++ arch/x86/include/asm/iommu.h | 46 + arch/x86/include/asm/ipcbuf.h | 28 + arch/x86/include/asm/ipi.h | 138 ++ arch/x86/include/asm/irq.h | 50 + arch/x86/include/asm/irq_regs.h | 5 + arch/x86/include/asm/irq_regs_32.h | 29 + arch/x86/include/asm/irq_regs_64.h | 1 + arch/x86/include/asm/irq_remapping.h | 8 + arch/x86/include/asm/irq_vectors.h | 164 ++ arch/x86/include/asm/irqflags.h | 211 +++ arch/x86/include/asm/ist.h | 34 + arch/x86/include/asm/k8.h | 15 + arch/x86/include/asm/kdebug.h | 37 + arch/x86/include/asm/kexec.h | 175 +++ arch/x86/include/asm/kgdb.h | 79 + arch/x86/include/asm/kmap_types.h | 29 + arch/x86/include/asm/kprobes.h | 88 ++ arch/x86/include/asm/kvm.h | 211 +++ arch/x86/include/asm/kvm_host.h | 752 ++++++++++ arch/x86/include/asm/kvm_para.h | 147 ++ arch/x86/include/asm/kvm_x86_emulate.h | 184 +++ arch/x86/include/asm/ldt.h | 40 + arch/x86/include/asm/lguest.h | 94 ++ arch/x86/include/asm/lguest_hcall.h | 71 + arch/x86/include/asm/linkage.h | 61 + arch/x86/include/asm/local.h | 235 +++ arch/x86/include/asm/mach-default/apm.h | 73 + arch/x86/include/asm/mach-default/do_timer.h | 16 + arch/x86/include/asm/mach-default/entry_arch.h | 36 + arch/x86/include/asm/mach-default/mach_apic.h | 156 ++ arch/x86/include/asm/mach-default/mach_apicdef.h | 24 + arch/x86/include/asm/mach-default/mach_ipi.h | 64 + arch/x86/include/asm/mach-default/mach_mpparse.h | 17 + arch/x86/include/asm/mach-default/mach_mpspec.h | 12 + arch/x86/include/asm/mach-default/mach_timer.h | 48 + arch/x86/include/asm/mach-default/mach_traps.h | 33 + arch/x86/include/asm/mach-default/mach_wakecpu.h | 42 + arch/x86/include/asm/mach-default/pci-functions.h | 19 + arch/x86/include/asm/mach-default/setup_arch.h | 3 + arch/x86/include/asm/mach-default/smpboot_hooks.h | 59 + arch/x86/include/asm/mach-generic/gpio.h | 15 + arch/x86/include/asm/mach-generic/mach_apic.h | 33 + arch/x86/include/asm/mach-generic/mach_apicdef.h | 11 + arch/x86/include/asm/mach-generic/mach_ipi.h | 10 + arch/x86/include/asm/mach-generic/mach_mpparse.h | 10 + arch/x86/include/asm/mach-generic/mach_mpspec.h | 12 + arch/x86/include/asm/mach-rdc321x/gpio.h | 60 + arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h | 12 + arch/x86/include/asm/mach-voyager/do_timer.h | 17 + arch/x86/include/asm/mach-voyager/entry_arch.h | 26 + arch/x86/include/asm/mach-voyager/setup_arch.h | 12 + arch/x86/include/asm/math_emu.h | 31 + arch/x86/include/asm/mc146818rtc.h | 104 ++ arch/x86/include/asm/mca.h | 43 + arch/x86/include/asm/mca_dma.h | 201 +++ arch/x86/include/asm/mce.h | 130 ++ arch/x86/include/asm/microcode.h | 47 + arch/x86/include/asm/mman.h | 20 + arch/x86/include/asm/mmconfig.h | 12 + arch/x86/include/asm/mmu.h | 26 + arch/x86/include/asm/mmu_context.h | 37 + arch/x86/include/asm/mmu_context_32.h | 56 + arch/x86/include/asm/mmu_context_64.h | 54 + arch/x86/include/asm/mmx.h | 14 + arch/x86/include/asm/mmzone.h | 5 + arch/x86/include/asm/mmzone_32.h | 134 ++ arch/x86/include/asm/mmzone_64.h | 51 + arch/x86/include/asm/module.h | 80 + arch/x86/include/asm/mpspec.h | 145 ++ arch/x86/include/asm/mpspec_def.h | 180 +++ arch/x86/include/asm/msgbuf.h | 39 + arch/x86/include/asm/msidef.h | 55 + arch/x86/include/asm/msr-index.h | 332 +++++ arch/x86/include/asm/msr.h | 247 +++ arch/x86/include/asm/mtrr.h | 173 +++ arch/x86/include/asm/mutex.h | 5 + arch/x86/include/asm/mutex_32.h | 125 ++ arch/x86/include/asm/mutex_64.h | 100 ++ arch/x86/include/asm/nmi.h | 81 + arch/x86/include/asm/nops.h | 118 ++ arch/x86/include/asm/numa.h | 5 + arch/x86/include/asm/numa_32.h | 11 + arch/x86/include/asm/numa_64.h | 43 + arch/x86/include/asm/numaq.h | 169 +++ arch/x86/include/asm/numaq/apic.h | 136 ++ arch/x86/include/asm/numaq/apicdef.h | 14 + arch/x86/include/asm/numaq/ipi.h | 25 + arch/x86/include/asm/numaq/mpparse.h | 7 + arch/x86/include/asm/numaq/wakecpu.h | 43 + arch/x86/include/asm/olpc.h | 132 ++ arch/x86/include/asm/page.h | 209 +++ arch/x86/include/asm/page_32.h | 136 ++ arch/x86/include/asm/page_64.h | 105 ++ arch/x86/include/asm/param.h | 22 + arch/x86/include/asm/paravirt.h | 1650 +++++++++++++++++++++ arch/x86/include/asm/parport.h | 10 + arch/x86/include/asm/pat.h | 22 + arch/x86/include/asm/pci-direct.h | 21 + arch/x86/include/asm/pci.h | 114 ++ arch/x86/include/asm/pci_32.h | 34 + arch/x86/include/asm/pci_64.h | 66 + arch/x86/include/asm/pda.h | 137 ++ arch/x86/include/asm/percpu.h | 218 +++ arch/x86/include/asm/pgalloc.h | 114 ++ arch/x86/include/asm/pgtable-2level-defs.h | 20 + arch/x86/include/asm/pgtable-2level.h | 79 + arch/x86/include/asm/pgtable-3level-defs.h | 28 + arch/x86/include/asm/pgtable-3level.h | 175 +++ arch/x86/include/asm/pgtable.h | 561 +++++++ arch/x86/include/asm/pgtable_32.h | 191 +++ arch/x86/include/asm/pgtable_64.h | 285 ++++ arch/x86/include/asm/poll.h | 1 + arch/x86/include/asm/posix_types.h | 13 + arch/x86/include/asm/posix_types_32.h | 85 ++ arch/x86/include/asm/posix_types_64.h | 119 ++ arch/x86/include/asm/prctl.h | 10 + arch/x86/include/asm/processor-cyrix.h | 38 + arch/x86/include/asm/processor-flags.h | 100 ++ arch/x86/include/asm/processor.h | 936 ++++++++++++ arch/x86/include/asm/proto.h | 32 + arch/x86/include/asm/ptrace-abi.h | 145 ++ arch/x86/include/asm/ptrace.h | 280 ++++ arch/x86/include/asm/pvclock-abi.h | 42 + arch/x86/include/asm/pvclock.h | 14 + arch/x86/include/asm/reboot.h | 21 + arch/x86/include/asm/reboot_fixups.h | 6 + arch/x86/include/asm/required-features.h | 82 + arch/x86/include/asm/resource.h | 1 + arch/x86/include/asm/resume-trace.h | 21 + arch/x86/include/asm/rio.h | 63 + arch/x86/include/asm/rtc.h | 1 + arch/x86/include/asm/rwlock.h | 8 + arch/x86/include/asm/rwsem.h | 265 ++++ arch/x86/include/asm/scatterlist.h | 33 + arch/x86/include/asm/seccomp.h | 5 + arch/x86/include/asm/seccomp_32.h | 17 + arch/x86/include/asm/seccomp_64.h | 25 + arch/x86/include/asm/sections.h | 1 + arch/x86/include/asm/segment.h | 209 +++ arch/x86/include/asm/sembuf.h | 24 + arch/x86/include/asm/serial.h | 29 + arch/x86/include/asm/setup.h | 105 ++ arch/x86/include/asm/shmbuf.h | 51 + arch/x86/include/asm/shmparam.h | 6 + arch/x86/include/asm/sigcontext.h | 284 ++++ arch/x86/include/asm/sigcontext32.h | 75 + arch/x86/include/asm/siginfo.h | 10 + arch/x86/include/asm/signal.h | 262 ++++ arch/x86/include/asm/smp.h | 229 +++ arch/x86/include/asm/socket.h | 57 + arch/x86/include/asm/sockios.h | 13 + arch/x86/include/asm/sparsemem.h | 34 + arch/x86/include/asm/spinlock.h | 364 +++++ arch/x86/include/asm/spinlock_types.h | 20 + arch/x86/include/asm/srat.h | 39 + arch/x86/include/asm/stacktrace.h | 21 + arch/x86/include/asm/stat.h | 114 ++ arch/x86/include/asm/statfs.h | 12 + arch/x86/include/asm/string.h | 5 + arch/x86/include/asm/string_32.h | 326 ++++ arch/x86/include/asm/string_64.h | 60 + arch/x86/include/asm/summit/apic.h | 184 +++ arch/x86/include/asm/summit/apicdef.h | 13 + arch/x86/include/asm/summit/ipi.h | 25 + arch/x86/include/asm/summit/mpparse.h | 109 ++ arch/x86/include/asm/suspend.h | 5 + arch/x86/include/asm/suspend_32.h | 51 + arch/x86/include/asm/suspend_64.h | 52 + arch/x86/include/asm/swiotlb.h | 58 + arch/x86/include/asm/sync_bitops.h | 130 ++ arch/x86/include/asm/syscall.h | 211 +++ arch/x86/include/asm/syscalls.h | 93 ++ arch/x86/include/asm/system.h | 425 ++++++ arch/x86/include/asm/system_64.h | 22 + arch/x86/include/asm/tce.h | 48 + arch/x86/include/asm/termbits.h | 198 +++ arch/x86/include/asm/termios.h | 113 ++ arch/x86/include/asm/therm_throt.h | 9 + arch/x86/include/asm/thread_info.h | 264 ++++ arch/x86/include/asm/time.h | 63 + arch/x86/include/asm/timer.h | 66 + arch/x86/include/asm/timex.h | 19 + arch/x86/include/asm/tlb.h | 11 + arch/x86/include/asm/tlbflush.h | 178 +++ arch/x86/include/asm/topology.h | 258 ++++ arch/x86/include/asm/trampoline.h | 21 + arch/x86/include/asm/traps.h | 81 + arch/x86/include/asm/tsc.h | 62 + arch/x86/include/asm/types.h | 36 + arch/x86/include/asm/uaccess.h | 454 ++++++ arch/x86/include/asm/uaccess_32.h | 218 +++ arch/x86/include/asm/uaccess_64.h | 202 +++ arch/x86/include/asm/ucontext.h | 18 + arch/x86/include/asm/unaligned.h | 14 + arch/x86/include/asm/unistd.h | 13 + arch/x86/include/asm/unistd_32.h | 379 +++++ arch/x86/include/asm/unistd_64.h | 693 +++++++++ arch/x86/include/asm/unwind.h | 13 + arch/x86/include/asm/user.h | 5 + arch/x86/include/asm/user32.h | 70 + arch/x86/include/asm/user_32.h | 131 ++ arch/x86/include/asm/user_64.h | 137 ++ arch/x86/include/asm/uv/bios.h | 94 ++ arch/x86/include/asm/uv/uv_bau.h | 332 +++++ arch/x86/include/asm/uv/uv_hub.h | 354 +++++ arch/x86/include/asm/uv/uv_irq.h | 36 + arch/x86/include/asm/uv/uv_mmrs.h | 1295 ++++++++++++++++ arch/x86/include/asm/vdso.h | 47 + arch/x86/include/asm/vga.h | 20 + arch/x86/include/asm/vgtod.h | 29 + arch/x86/include/asm/vic.h | 61 + arch/x86/include/asm/visws/cobalt.h | 125 ++ arch/x86/include/asm/visws/lithium.h | 53 + arch/x86/include/asm/visws/piix4.h | 107 ++ arch/x86/include/asm/visws/sgivw.h | 5 + arch/x86/include/asm/vm86.h | 208 +++ arch/x86/include/asm/vmi.h | 263 ++++ arch/x86/include/asm/vmi_time.h | 98 ++ arch/x86/include/asm/voyager.h | 528 +++++++ arch/x86/include/asm/vsyscall.h | 44 + arch/x86/include/asm/xcr.h | 49 + arch/x86/include/asm/xen/events.h | 24 + arch/x86/include/asm/xen/grant_table.h | 7 + arch/x86/include/asm/xen/hypercall.h | 527 +++++++ arch/x86/include/asm/xen/hypervisor.h | 82 + arch/x86/include/asm/xen/interface.h | 175 +++ arch/x86/include/asm/xen/interface_32.h | 97 ++ arch/x86/include/asm/xen/interface_64.h | 159 ++ arch/x86/include/asm/xen/page.h | 165 +++ arch/x86/include/asm/xor.h | 5 + arch/x86/include/asm/xor_32.h | 888 +++++++++++ arch/x86/include/asm/xor_64.h | 361 +++++ arch/x86/include/asm/xsave.h | 118 ++ arch/x86/kernel/cpu/Makefile | 2 +- drivers/xen/cpu_hotplug.c | 2 +- include/asm-x86/Kbuild | 24 - include/asm-x86/a.out-core.h | 73 - include/asm-x86/a.out.h | 20 - include/asm-x86/acpi.h | 178 --- include/asm-x86/agp.h | 35 - include/asm-x86/alternative-asm.h | 22 - include/asm-x86/alternative.h | 183 --- include/asm-x86/amd_iommu.h | 35 - include/asm-x86/amd_iommu_types.h | 404 ----- include/asm-x86/apic.h | 199 --- include/asm-x86/apicdef.h | 417 ------ include/asm-x86/arch_hooks.h | 26 - include/asm-x86/asm.h | 47 - include/asm-x86/atomic.h | 5 - include/asm-x86/atomic_32.h | 259 ---- include/asm-x86/atomic_64.h | 473 ------ include/asm-x86/auxvec.h | 12 - include/asm-x86/bigsmp/apic.h | 139 -- include/asm-x86/bigsmp/apicdef.h | 13 - include/asm-x86/bigsmp/ipi.h | 25 - include/asm-x86/bios_ebda.h | 36 - include/asm-x86/bitops.h | 451 ------ include/asm-x86/boot.h | 26 - include/asm-x86/bootparam.h | 111 -- include/asm-x86/bug.h | 39 - include/asm-x86/bugs.h | 12 - include/asm-x86/byteorder.h | 81 - include/asm-x86/cache.h | 20 - include/asm-x86/cacheflush.h | 118 -- include/asm-x86/calgary.h | 72 - include/asm-x86/calling.h | 170 --- include/asm-x86/checksum.h | 5 - include/asm-x86/checksum_32.h | 189 --- include/asm-x86/checksum_64.h | 191 --- include/asm-x86/cmpxchg.h | 5 - include/asm-x86/cmpxchg_32.h | 344 ----- include/asm-x86/cmpxchg_64.h | 185 --- include/asm-x86/compat.h | 218 --- include/asm-x86/cpu.h | 20 - include/asm-x86/cpufeature.h | 271 ---- include/asm-x86/cputime.h | 1 - include/asm-x86/current.h | 39 - include/asm-x86/debugreg.h | 70 - include/asm-x86/delay.h | 31 - include/asm-x86/desc.h | 409 ----- include/asm-x86/desc_defs.h | 95 -- include/asm-x86/device.h | 16 - include/asm-x86/div64.h | 60 - include/asm-x86/dma-mapping.h | 308 ---- include/asm-x86/dma.h | 318 ---- include/asm-x86/dmi.h | 26 - include/asm-x86/ds.h | 238 --- include/asm-x86/dwarf2.h | 61 - include/asm-x86/e820.h | 146 -- include/asm-x86/edac.h | 18 - include/asm-x86/efi.h | 110 -- include/asm-x86/elf.h | 336 ----- include/asm-x86/emergency-restart.h | 18 - include/asm-x86/errno.h | 1 - include/asm-x86/es7000/apic.h | 193 --- include/asm-x86/es7000/apicdef.h | 13 - include/asm-x86/es7000/ipi.h | 24 - include/asm-x86/es7000/mpparse.h | 30 - include/asm-x86/es7000/wakecpu.h | 59 - include/asm-x86/fb.h | 21 - include/asm-x86/fcntl.h | 1 - include/asm-x86/fixmap.h | 68 - include/asm-x86/fixmap_32.h | 123 -- include/asm-x86/fixmap_64.h | 83 -- include/asm-x86/floppy.h | 281 ---- include/asm-x86/frame.h | 27 - include/asm-x86/ftrace.h | 24 - include/asm-x86/futex.h | 140 -- include/asm-x86/gart.h | 73 - include/asm-x86/genapic.h | 5 - include/asm-x86/genapic_32.h | 126 -- include/asm-x86/genapic_64.h | 58 - include/asm-x86/geode.h | 253 ---- include/asm-x86/gpio.h | 56 - include/asm-x86/hardirq.h | 11 - include/asm-x86/hardirq_32.h | 28 - include/asm-x86/hardirq_64.h | 23 - include/asm-x86/highmem.h | 82 - include/asm-x86/hpet.h | 114 -- include/asm-x86/hugetlb.h | 93 -- include/asm-x86/hw_irq.h | 131 -- include/asm-x86/hypertransport.h | 45 - include/asm-x86/i387.h | 400 ----- include/asm-x86/i8253.h | 18 - include/asm-x86/i8259.h | 63 - include/asm-x86/ia32.h | 170 --- include/asm-x86/ia32_unistd.h | 18 - include/asm-x86/idle.h | 15 - include/asm-x86/intel_arch_perfmon.h | 31 - include/asm-x86/io.h | 91 -- include/asm-x86/io_32.h | 284 ---- include/asm-x86/io_64.h | 244 --- include/asm-x86/io_apic.h | 204 --- include/asm-x86/ioctl.h | 1 - include/asm-x86/ioctls.h | 94 -- include/asm-x86/iommu.h | 46 - include/asm-x86/ipcbuf.h | 28 - include/asm-x86/ipi.h | 138 -- include/asm-x86/irq.h | 50 - include/asm-x86/irq_regs.h | 5 - include/asm-x86/irq_regs_32.h | 29 - include/asm-x86/irq_regs_64.h | 1 - include/asm-x86/irq_remapping.h | 8 - include/asm-x86/irq_vectors.h | 164 -- include/asm-x86/irqflags.h | 211 --- include/asm-x86/ist.h | 34 - include/asm-x86/k8.h | 15 - include/asm-x86/kdebug.h | 37 - include/asm-x86/kexec.h | 175 --- include/asm-x86/kgdb.h | 79 - include/asm-x86/kmap_types.h | 29 - include/asm-x86/kprobes.h | 88 -- include/asm-x86/kvm.h | 211 --- include/asm-x86/kvm_host.h | 752 ---------- include/asm-x86/kvm_para.h | 147 -- include/asm-x86/kvm_x86_emulate.h | 184 --- include/asm-x86/ldt.h | 40 - include/asm-x86/lguest.h | 94 -- include/asm-x86/lguest_hcall.h | 71 - include/asm-x86/linkage.h | 61 - include/asm-x86/local.h | 235 --- include/asm-x86/mach-default/apm.h | 73 - include/asm-x86/mach-default/do_timer.h | 16 - include/asm-x86/mach-default/entry_arch.h | 36 - include/asm-x86/mach-default/mach_apic.h | 156 -- include/asm-x86/mach-default/mach_apicdef.h | 24 - include/asm-x86/mach-default/mach_ipi.h | 64 - include/asm-x86/mach-default/mach_mpparse.h | 17 - include/asm-x86/mach-default/mach_mpspec.h | 12 - include/asm-x86/mach-default/mach_timer.h | 48 - include/asm-x86/mach-default/mach_traps.h | 33 - include/asm-x86/mach-default/mach_wakecpu.h | 42 - include/asm-x86/mach-default/pci-functions.h | 19 - include/asm-x86/mach-default/setup_arch.h | 3 - include/asm-x86/mach-default/smpboot_hooks.h | 59 - include/asm-x86/mach-generic/gpio.h | 15 - include/asm-x86/mach-generic/mach_apic.h | 33 - include/asm-x86/mach-generic/mach_apicdef.h | 11 - include/asm-x86/mach-generic/mach_ipi.h | 10 - include/asm-x86/mach-generic/mach_mpparse.h | 10 - include/asm-x86/mach-generic/mach_mpspec.h | 12 - include/asm-x86/mach-rdc321x/gpio.h | 60 - include/asm-x86/mach-rdc321x/rdc321x_defs.h | 12 - include/asm-x86/mach-voyager/do_timer.h | 17 - include/asm-x86/mach-voyager/entry_arch.h | 26 - include/asm-x86/mach-voyager/setup_arch.h | 12 - include/asm-x86/math_emu.h | 31 - include/asm-x86/mc146818rtc.h | 104 -- include/asm-x86/mca.h | 43 - include/asm-x86/mca_dma.h | 201 --- include/asm-x86/mce.h | 130 -- include/asm-x86/microcode.h | 47 - include/asm-x86/mman.h | 20 - include/asm-x86/mmconfig.h | 12 - include/asm-x86/mmu.h | 26 - include/asm-x86/mmu_context.h | 37 - include/asm-x86/mmu_context_32.h | 56 - include/asm-x86/mmu_context_64.h | 54 - include/asm-x86/mmx.h | 14 - include/asm-x86/mmzone.h | 5 - include/asm-x86/mmzone_32.h | 134 -- include/asm-x86/mmzone_64.h | 51 - include/asm-x86/module.h | 80 - include/asm-x86/mpspec.h | 145 -- include/asm-x86/mpspec_def.h | 180 --- include/asm-x86/msgbuf.h | 39 - include/asm-x86/msidef.h | 55 - include/asm-x86/msr-index.h | 332 ----- include/asm-x86/msr.h | 247 --- include/asm-x86/mtrr.h | 173 --- include/asm-x86/mutex.h | 5 - include/asm-x86/mutex_32.h | 125 -- include/asm-x86/mutex_64.h | 100 -- include/asm-x86/nmi.h | 81 - include/asm-x86/nops.h | 118 -- include/asm-x86/numa.h | 5 - include/asm-x86/numa_32.h | 11 - include/asm-x86/numa_64.h | 43 - include/asm-x86/numaq.h | 169 --- include/asm-x86/numaq/apic.h | 136 -- include/asm-x86/numaq/apicdef.h | 14 - include/asm-x86/numaq/ipi.h | 25 - include/asm-x86/numaq/mpparse.h | 7 - include/asm-x86/numaq/wakecpu.h | 43 - include/asm-x86/olpc.h | 132 -- include/asm-x86/page.h | 209 --- include/asm-x86/page_32.h | 136 -- include/asm-x86/page_64.h | 105 -- include/asm-x86/param.h | 22 - include/asm-x86/paravirt.h | 1650 --------------------- include/asm-x86/parport.h | 10 - include/asm-x86/pat.h | 22 - include/asm-x86/pci-direct.h | 21 - include/asm-x86/pci.h | 114 -- include/asm-x86/pci_32.h | 34 - include/asm-x86/pci_64.h | 66 - include/asm-x86/pda.h | 137 -- include/asm-x86/percpu.h | 218 --- include/asm-x86/pgalloc.h | 114 -- include/asm-x86/pgtable-2level-defs.h | 20 - include/asm-x86/pgtable-2level.h | 79 - include/asm-x86/pgtable-3level-defs.h | 28 - include/asm-x86/pgtable-3level.h | 175 --- include/asm-x86/pgtable.h | 561 ------- include/asm-x86/pgtable_32.h | 191 --- include/asm-x86/pgtable_64.h | 285 ---- include/asm-x86/poll.h | 1 - include/asm-x86/posix_types.h | 13 - include/asm-x86/posix_types_32.h | 85 -- include/asm-x86/posix_types_64.h | 119 -- include/asm-x86/prctl.h | 10 - include/asm-x86/processor-cyrix.h | 38 - include/asm-x86/processor-flags.h | 100 -- include/asm-x86/processor.h | 936 ------------ include/asm-x86/proto.h | 32 - include/asm-x86/ptrace-abi.h | 145 -- include/asm-x86/ptrace.h | 280 ---- include/asm-x86/pvclock-abi.h | 42 - include/asm-x86/pvclock.h | 14 - include/asm-x86/reboot.h | 21 - include/asm-x86/reboot_fixups.h | 6 - include/asm-x86/required-features.h | 82 - include/asm-x86/resource.h | 1 - include/asm-x86/resume-trace.h | 21 - include/asm-x86/rio.h | 63 - include/asm-x86/rtc.h | 1 - include/asm-x86/rwlock.h | 8 - include/asm-x86/rwsem.h | 265 ---- include/asm-x86/scatterlist.h | 33 - include/asm-x86/seccomp.h | 5 - include/asm-x86/seccomp_32.h | 17 - include/asm-x86/seccomp_64.h | 25 - include/asm-x86/sections.h | 1 - include/asm-x86/segment.h | 209 --- include/asm-x86/sembuf.h | 24 - include/asm-x86/serial.h | 29 - include/asm-x86/setup.h | 105 -- include/asm-x86/shmbuf.h | 51 - include/asm-x86/shmparam.h | 6 - include/asm-x86/sigcontext.h | 284 ---- include/asm-x86/sigcontext32.h | 75 - include/asm-x86/siginfo.h | 10 - include/asm-x86/signal.h | 262 ---- include/asm-x86/smp.h | 229 --- include/asm-x86/socket.h | 57 - include/asm-x86/sockios.h | 13 - include/asm-x86/sparsemem.h | 34 - include/asm-x86/spinlock.h | 364 ----- include/asm-x86/spinlock_types.h | 20 - include/asm-x86/srat.h | 39 - include/asm-x86/stacktrace.h | 21 - include/asm-x86/stat.h | 114 -- include/asm-x86/statfs.h | 12 - include/asm-x86/string.h | 5 - include/asm-x86/string_32.h | 326 ---- include/asm-x86/string_64.h | 60 - include/asm-x86/summit/apic.h | 184 --- include/asm-x86/summit/apicdef.h | 13 - include/asm-x86/summit/ipi.h | 25 - include/asm-x86/summit/mpparse.h | 109 -- include/asm-x86/suspend.h | 5 - include/asm-x86/suspend_32.h | 51 - include/asm-x86/suspend_64.h | 52 - include/asm-x86/swiotlb.h | 58 - include/asm-x86/sync_bitops.h | 130 -- include/asm-x86/syscall.h | 211 --- include/asm-x86/syscalls.h | 93 -- include/asm-x86/system.h | 425 ------ include/asm-x86/system_64.h | 22 - include/asm-x86/tce.h | 48 - include/asm-x86/termbits.h | 198 --- include/asm-x86/termios.h | 113 -- include/asm-x86/therm_throt.h | 9 - include/asm-x86/thread_info.h | 264 ---- include/asm-x86/time.h | 63 - include/asm-x86/timer.h | 66 - include/asm-x86/timex.h | 19 - include/asm-x86/tlb.h | 11 - include/asm-x86/tlbflush.h | 178 --- include/asm-x86/topology.h | 258 ---- include/asm-x86/trampoline.h | 21 - include/asm-x86/traps.h | 81 - include/asm-x86/tsc.h | 62 - include/asm-x86/types.h | 36 - include/asm-x86/uaccess.h | 454 ------ include/asm-x86/uaccess_32.h | 218 --- include/asm-x86/uaccess_64.h | 202 --- include/asm-x86/ucontext.h | 18 - include/asm-x86/unaligned.h | 14 - include/asm-x86/unistd.h | 13 - include/asm-x86/unistd_32.h | 379 ----- include/asm-x86/unistd_64.h | 693 --------- include/asm-x86/unwind.h | 13 - include/asm-x86/user.h | 5 - include/asm-x86/user32.h | 70 - include/asm-x86/user_32.h | 131 -- include/asm-x86/user_64.h | 137 -- include/asm-x86/uv/bios.h | 94 -- include/asm-x86/uv/uv_bau.h | 332 ----- include/asm-x86/uv/uv_hub.h | 354 ----- include/asm-x86/uv/uv_irq.h | 36 - include/asm-x86/uv/uv_mmrs.h | 1295 ---------------- include/asm-x86/vdso.h | 47 - include/asm-x86/vga.h | 20 - include/asm-x86/vgtod.h | 29 - include/asm-x86/vic.h | 61 - include/asm-x86/visws/cobalt.h | 125 -- include/asm-x86/visws/lithium.h | 53 - include/asm-x86/visws/piix4.h | 107 -- include/asm-x86/visws/sgivw.h | 5 - include/asm-x86/vm86.h | 208 --- include/asm-x86/vmi.h | 263 ---- include/asm-x86/vmi_time.h | 98 -- include/asm-x86/voyager.h | 528 ------- include/asm-x86/vsyscall.h | 44 - include/asm-x86/xcr.h | 49 - include/asm-x86/xen/events.h | 24 - include/asm-x86/xen/grant_table.h | 7 - include/asm-x86/xen/hypercall.h | 527 ------- include/asm-x86/xen/hypervisor.h | 82 - include/asm-x86/xen/interface.h | 175 --- include/asm-x86/xen/interface_32.h | 97 -- include/asm-x86/xen/interface_64.h | 159 -- include/asm-x86/xen/page.h | 165 --- include/asm-x86/xor.h | 5 - include/asm-x86/xor_32.h | 888 ----------- include/asm-x86/xor_64.h | 361 ----- include/asm-x86/xsave.h | 118 -- scripts/checksyscalls.sh | 2 +- 673 files changed, 38683 insertions(+), 38683 deletions(-) create mode 100644 arch/x86/include/asm/Kbuild create mode 100644 arch/x86/include/asm/a.out-core.h create mode 100644 arch/x86/include/asm/a.out.h create mode 100644 arch/x86/include/asm/acpi.h create mode 100644 arch/x86/include/asm/agp.h create mode 100644 arch/x86/include/asm/alternative-asm.h create mode 100644 arch/x86/include/asm/alternative.h create mode 100644 arch/x86/include/asm/amd_iommu.h create mode 100644 arch/x86/include/asm/amd_iommu_types.h create mode 100644 arch/x86/include/asm/apic.h create mode 100644 arch/x86/include/asm/apicdef.h create mode 100644 arch/x86/include/asm/arch_hooks.h create mode 100644 arch/x86/include/asm/asm.h create mode 100644 arch/x86/include/asm/atomic.h create mode 100644 arch/x86/include/asm/atomic_32.h create mode 100644 arch/x86/include/asm/atomic_64.h create mode 100644 arch/x86/include/asm/auxvec.h create mode 100644 arch/x86/include/asm/bigsmp/apic.h create mode 100644 arch/x86/include/asm/bigsmp/apicdef.h create mode 100644 arch/x86/include/asm/bigsmp/ipi.h create mode 100644 arch/x86/include/asm/bios_ebda.h create mode 100644 arch/x86/include/asm/bitops.h create mode 100644 arch/x86/include/asm/boot.h create mode 100644 arch/x86/include/asm/bootparam.h create mode 100644 arch/x86/include/asm/bug.h create mode 100644 arch/x86/include/asm/bugs.h create mode 100644 arch/x86/include/asm/byteorder.h create mode 100644 arch/x86/include/asm/cache.h create mode 100644 arch/x86/include/asm/cacheflush.h create mode 100644 arch/x86/include/asm/calgary.h create mode 100644 arch/x86/include/asm/calling.h create mode 100644 arch/x86/include/asm/checksum.h create mode 100644 arch/x86/include/asm/checksum_32.h create mode 100644 arch/x86/include/asm/checksum_64.h create mode 100644 arch/x86/include/asm/cmpxchg.h create mode 100644 arch/x86/include/asm/cmpxchg_32.h create mode 100644 arch/x86/include/asm/cmpxchg_64.h create mode 100644 arch/x86/include/asm/compat.h create mode 100644 arch/x86/include/asm/cpu.h create mode 100644 arch/x86/include/asm/cpufeature.h create mode 100644 arch/x86/include/asm/cputime.h create mode 100644 arch/x86/include/asm/current.h create mode 100644 arch/x86/include/asm/debugreg.h create mode 100644 arch/x86/include/asm/delay.h create mode 100644 arch/x86/include/asm/desc.h create mode 100644 arch/x86/include/asm/desc_defs.h create mode 100644 arch/x86/include/asm/device.h create mode 100644 arch/x86/include/asm/div64.h create mode 100644 arch/x86/include/asm/dma-mapping.h create mode 100644 arch/x86/include/asm/dma.h create mode 100644 arch/x86/include/asm/dmi.h create mode 100644 arch/x86/include/asm/ds.h create mode 100644 arch/x86/include/asm/dwarf2.h create mode 100644 arch/x86/include/asm/e820.h create mode 100644 arch/x86/include/asm/edac.h create mode 100644 arch/x86/include/asm/efi.h create mode 100644 arch/x86/include/asm/elf.h create mode 100644 arch/x86/include/asm/emergency-restart.h create mode 100644 arch/x86/include/asm/errno.h create mode 100644 arch/x86/include/asm/es7000/apic.h create mode 100644 arch/x86/include/asm/es7000/apicdef.h create mode 100644 arch/x86/include/asm/es7000/ipi.h create mode 100644 arch/x86/include/asm/es7000/mpparse.h create mode 100644 arch/x86/include/asm/es7000/wakecpu.h create mode 100644 arch/x86/include/asm/fb.h create mode 100644 arch/x86/include/asm/fcntl.h create mode 100644 arch/x86/include/asm/fixmap.h create mode 100644 arch/x86/include/asm/fixmap_32.h create mode 100644 arch/x86/include/asm/fixmap_64.h create mode 100644 arch/x86/include/asm/floppy.h create mode 100644 arch/x86/include/asm/frame.h create mode 100644 arch/x86/include/asm/ftrace.h create mode 100644 arch/x86/include/asm/futex.h create mode 100644 arch/x86/include/asm/gart.h create mode 100644 arch/x86/include/asm/genapic.h create mode 100644 arch/x86/include/asm/genapic_32.h create mode 100644 arch/x86/include/asm/genapic_64.h create mode 100644 arch/x86/include/asm/geode.h create mode 100644 arch/x86/include/asm/gpio.h create mode 100644 arch/x86/include/asm/hardirq.h create mode 100644 arch/x86/include/asm/hardirq_32.h create mode 100644 arch/x86/include/asm/hardirq_64.h create mode 100644 arch/x86/include/asm/highmem.h create mode 100644 arch/x86/include/asm/hpet.h create mode 100644 arch/x86/include/asm/hugetlb.h create mode 100644 arch/x86/include/asm/hw_irq.h create mode 100644 arch/x86/include/asm/hypertransport.h create mode 100644 arch/x86/include/asm/i387.h create mode 100644 arch/x86/include/asm/i8253.h create mode 100644 arch/x86/include/asm/i8259.h create mode 100644 arch/x86/include/asm/ia32.h create mode 100644 arch/x86/include/asm/ia32_unistd.h create mode 100644 arch/x86/include/asm/idle.h create mode 100644 arch/x86/include/asm/intel_arch_perfmon.h create mode 100644 arch/x86/include/asm/io.h create mode 100644 arch/x86/include/asm/io_32.h create mode 100644 arch/x86/include/asm/io_64.h create mode 100644 arch/x86/include/asm/io_apic.h create mode 100644 arch/x86/include/asm/ioctl.h create mode 100644 arch/x86/include/asm/ioctls.h create mode 100644 arch/x86/include/asm/iommu.h create mode 100644 arch/x86/include/asm/ipcbuf.h create mode 100644 arch/x86/include/asm/ipi.h create mode 100644 arch/x86/include/asm/irq.h create mode 100644 arch/x86/include/asm/irq_regs.h create mode 100644 arch/x86/include/asm/irq_regs_32.h create mode 100644 arch/x86/include/asm/irq_regs_64.h create mode 100644 arch/x86/include/asm/irq_remapping.h create mode 100644 arch/x86/include/asm/irq_vectors.h create mode 100644 arch/x86/include/asm/irqflags.h create mode 100644 arch/x86/include/asm/ist.h create mode 100644 arch/x86/include/asm/k8.h create mode 100644 arch/x86/include/asm/kdebug.h create mode 100644 arch/x86/include/asm/kexec.h create mode 100644 arch/x86/include/asm/kgdb.h create mode 100644 arch/x86/include/asm/kmap_types.h create mode 100644 arch/x86/include/asm/kprobes.h create mode 100644 arch/x86/include/asm/kvm.h create mode 100644 arch/x86/include/asm/kvm_host.h create mode 100644 arch/x86/include/asm/kvm_para.h create mode 100644 arch/x86/include/asm/kvm_x86_emulate.h create mode 100644 arch/x86/include/asm/ldt.h create mode 100644 arch/x86/include/asm/lguest.h create mode 100644 arch/x86/include/asm/lguest_hcall.h create mode 100644 arch/x86/include/asm/linkage.h create mode 100644 arch/x86/include/asm/local.h create mode 100644 arch/x86/include/asm/mach-default/apm.h create mode 100644 arch/x86/include/asm/mach-default/do_timer.h create mode 100644 arch/x86/include/asm/mach-default/entry_arch.h create mode 100644 arch/x86/include/asm/mach-default/mach_apic.h create mode 100644 arch/x86/include/asm/mach-default/mach_apicdef.h create mode 100644 arch/x86/include/asm/mach-default/mach_ipi.h create mode 100644 arch/x86/include/asm/mach-default/mach_mpparse.h create mode 100644 arch/x86/include/asm/mach-default/mach_mpspec.h create mode 100644 arch/x86/include/asm/mach-default/mach_timer.h create mode 100644 arch/x86/include/asm/mach-default/mach_traps.h create mode 100644 arch/x86/include/asm/mach-default/mach_wakecpu.h create mode 100644 arch/x86/include/asm/mach-default/pci-functions.h create mode 100644 arch/x86/include/asm/mach-default/setup_arch.h create mode 100644 arch/x86/include/asm/mach-default/smpboot_hooks.h create mode 100644 arch/x86/include/asm/mach-generic/gpio.h create mode 100644 arch/x86/include/asm/mach-generic/mach_apic.h create mode 100644 arch/x86/include/asm/mach-generic/mach_apicdef.h create mode 100644 arch/x86/include/asm/mach-generic/mach_ipi.h create mode 100644 arch/x86/include/asm/mach-generic/mach_mpparse.h create mode 100644 arch/x86/include/asm/mach-generic/mach_mpspec.h create mode 100644 arch/x86/include/asm/mach-rdc321x/gpio.h create mode 100644 arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h create mode 100644 arch/x86/include/asm/mach-voyager/do_timer.h create mode 100644 arch/x86/include/asm/mach-voyager/entry_arch.h create mode 100644 arch/x86/include/asm/mach-voyager/setup_arch.h create mode 100644 arch/x86/include/asm/math_emu.h create mode 100644 arch/x86/include/asm/mc146818rtc.h create mode 100644 arch/x86/include/asm/mca.h create mode 100644 arch/x86/include/asm/mca_dma.h create mode 100644 arch/x86/include/asm/mce.h create mode 100644 arch/x86/include/asm/microcode.h create mode 100644 arch/x86/include/asm/mman.h create mode 100644 arch/x86/include/asm/mmconfig.h create mode 100644 arch/x86/include/asm/mmu.h create mode 100644 arch/x86/include/asm/mmu_context.h create mode 100644 arch/x86/include/asm/mmu_context_32.h create mode 100644 arch/x86/include/asm/mmu_context_64.h create mode 100644 arch/x86/include/asm/mmx.h create mode 100644 arch/x86/include/asm/mmzone.h create mode 100644 arch/x86/include/asm/mmzone_32.h create mode 100644 arch/x86/include/asm/mmzone_64.h create mode 100644 arch/x86/include/asm/module.h create mode 100644 arch/x86/include/asm/mpspec.h create mode 100644 arch/x86/include/asm/mpspec_def.h create mode 100644 arch/x86/include/asm/msgbuf.h create mode 100644 arch/x86/include/asm/msidef.h create mode 100644 arch/x86/include/asm/msr-index.h create mode 100644 arch/x86/include/asm/msr.h create mode 100644 arch/x86/include/asm/mtrr.h create mode 100644 arch/x86/include/asm/mutex.h create mode 100644 arch/x86/include/asm/mutex_32.h create mode 100644 arch/x86/include/asm/mutex_64.h create mode 100644 arch/x86/include/asm/nmi.h create mode 100644 arch/x86/include/asm/nops.h create mode 100644 arch/x86/include/asm/numa.h create mode 100644 arch/x86/include/asm/numa_32.h create mode 100644 arch/x86/include/asm/numa_64.h create mode 100644 arch/x86/include/asm/numaq.h create mode 100644 arch/x86/include/asm/numaq/apic.h create mode 100644 arch/x86/include/asm/numaq/apicdef.h create mode 100644 arch/x86/include/asm/numaq/ipi.h create mode 100644 arch/x86/include/asm/numaq/mpparse.h create mode 100644 arch/x86/include/asm/numaq/wakecpu.h create mode 100644 arch/x86/include/asm/olpc.h create mode 100644 arch/x86/include/asm/page.h create mode 100644 arch/x86/include/asm/page_32.h create mode 100644 arch/x86/include/asm/page_64.h create mode 100644 arch/x86/include/asm/param.h create mode 100644 arch/x86/include/asm/paravirt.h create mode 100644 arch/x86/include/asm/parport.h create mode 100644 arch/x86/include/asm/pat.h create mode 100644 arch/x86/include/asm/pci-direct.h create mode 100644 arch/x86/include/asm/pci.h create mode 100644 arch/x86/include/asm/pci_32.h create mode 100644 arch/x86/include/asm/pci_64.h create mode 100644 arch/x86/include/asm/pda.h create mode 100644 arch/x86/include/asm/percpu.h create mode 100644 arch/x86/include/asm/pgalloc.h create mode 100644 arch/x86/include/asm/pgtable-2level-defs.h create mode 100644 arch/x86/include/asm/pgtable-2level.h create mode 100644 arch/x86/include/asm/pgtable-3level-defs.h create mode 100644 arch/x86/include/asm/pgtable-3level.h create mode 100644 arch/x86/include/asm/pgtable.h create mode 100644 arch/x86/include/asm/pgtable_32.h create mode 100644 arch/x86/include/asm/pgtable_64.h create mode 100644 arch/x86/include/asm/poll.h create mode 100644 arch/x86/include/asm/posix_types.h create mode 100644 arch/x86/include/asm/posix_types_32.h create mode 100644 arch/x86/include/asm/posix_types_64.h create mode 100644 arch/x86/include/asm/prctl.h create mode 100644 arch/x86/include/asm/processor-cyrix.h create mode 100644 arch/x86/include/asm/processor-flags.h create mode 100644 arch/x86/include/asm/processor.h create mode 100644 arch/x86/include/asm/proto.h create mode 100644 arch/x86/include/asm/ptrace-abi.h create mode 100644 arch/x86/include/asm/ptrace.h create mode 100644 arch/x86/include/asm/pvclock-abi.h create mode 100644 arch/x86/include/asm/pvclock.h create mode 100644 arch/x86/include/asm/reboot.h create mode 100644 arch/x86/include/asm/reboot_fixups.h create mode 100644 arch/x86/include/asm/required-features.h create mode 100644 arch/x86/include/asm/resource.h create mode 100644 arch/x86/include/asm/resume-trace.h create mode 100644 arch/x86/include/asm/rio.h create mode 100644 arch/x86/include/asm/rtc.h create mode 100644 arch/x86/include/asm/rwlock.h create mode 100644 arch/x86/include/asm/rwsem.h create mode 100644 arch/x86/include/asm/scatterlist.h create mode 100644 arch/x86/include/asm/seccomp.h create mode 100644 arch/x86/include/asm/seccomp_32.h create mode 100644 arch/x86/include/asm/seccomp_64.h create mode 100644 arch/x86/include/asm/sections.h create mode 100644 arch/x86/include/asm/segment.h create mode 100644 arch/x86/include/asm/sembuf.h create mode 100644 arch/x86/include/asm/serial.h create mode 100644 arch/x86/include/asm/setup.h create mode 100644 arch/x86/include/asm/shmbuf.h create mode 100644 arch/x86/include/asm/shmparam.h create mode 100644 arch/x86/include/asm/sigcontext.h create mode 100644 arch/x86/include/asm/sigcontext32.h create mode 100644 arch/x86/include/asm/siginfo.h create mode 100644 arch/x86/include/asm/signal.h create mode 100644 arch/x86/include/asm/smp.h create mode 100644 arch/x86/include/asm/socket.h create mode 100644 arch/x86/include/asm/sockios.h create mode 100644 arch/x86/include/asm/sparsemem.h create mode 100644 arch/x86/include/asm/spinlock.h create mode 100644 arch/x86/include/asm/spinlock_types.h create mode 100644 arch/x86/include/asm/srat.h create mode 100644 arch/x86/include/asm/stacktrace.h create mode 100644 arch/x86/include/asm/stat.h create mode 100644 arch/x86/include/asm/statfs.h create mode 100644 arch/x86/include/asm/string.h create mode 100644 arch/x86/include/asm/string_32.h create mode 100644 arch/x86/include/asm/string_64.h create mode 100644 arch/x86/include/asm/summit/apic.h create mode 100644 arch/x86/include/asm/summit/apicdef.h create mode 100644 arch/x86/include/asm/summit/ipi.h create mode 100644 arch/x86/include/asm/summit/mpparse.h create mode 100644 arch/x86/include/asm/suspend.h create mode 100644 arch/x86/include/asm/suspend_32.h create mode 100644 arch/x86/include/asm/suspend_64.h create mode 100644 arch/x86/include/asm/swiotlb.h create mode 100644 arch/x86/include/asm/sync_bitops.h create mode 100644 arch/x86/include/asm/syscall.h create mode 100644 arch/x86/include/asm/syscalls.h create mode 100644 arch/x86/include/asm/system.h create mode 100644 arch/x86/include/asm/system_64.h create mode 100644 arch/x86/include/asm/tce.h create mode 100644 arch/x86/include/asm/termbits.h create mode 100644 arch/x86/include/asm/termios.h create mode 100644 arch/x86/include/asm/therm_throt.h create mode 100644 arch/x86/include/asm/thread_info.h create mode 100644 arch/x86/include/asm/time.h create mode 100644 arch/x86/include/asm/timer.h create mode 100644 arch/x86/include/asm/timex.h create mode 100644 arch/x86/include/asm/tlb.h create mode 100644 arch/x86/include/asm/tlbflush.h create mode 100644 arch/x86/include/asm/topology.h create mode 100644 arch/x86/include/asm/trampoline.h create mode 100644 arch/x86/include/asm/traps.h create mode 100644 arch/x86/include/asm/tsc.h create mode 100644 arch/x86/include/asm/types.h create mode 100644 arch/x86/include/asm/uaccess.h create mode 100644 arch/x86/include/asm/uaccess_32.h create mode 100644 arch/x86/include/asm/uaccess_64.h create mode 100644 arch/x86/include/asm/ucontext.h create mode 100644 arch/x86/include/asm/unaligned.h create mode 100644 arch/x86/include/asm/unistd.h create mode 100644 arch/x86/include/asm/unistd_32.h create mode 100644 arch/x86/include/asm/unistd_64.h create mode 100644 arch/x86/include/asm/unwind.h create mode 100644 arch/x86/include/asm/user.h create mode 100644 arch/x86/include/asm/user32.h create mode 100644 arch/x86/include/asm/user_32.h create mode 100644 arch/x86/include/asm/user_64.h create mode 100644 arch/x86/include/asm/uv/bios.h create mode 100644 arch/x86/include/asm/uv/uv_bau.h create mode 100644 arch/x86/include/asm/uv/uv_hub.h create mode 100644 arch/x86/include/asm/uv/uv_irq.h create mode 100644 arch/x86/include/asm/uv/uv_mmrs.h create mode 100644 arch/x86/include/asm/vdso.h create mode 100644 arch/x86/include/asm/vga.h create mode 100644 arch/x86/include/asm/vgtod.h create mode 100644 arch/x86/include/asm/vic.h create mode 100644 arch/x86/include/asm/visws/cobalt.h create mode 100644 arch/x86/include/asm/visws/lithium.h create mode 100644 arch/x86/include/asm/visws/piix4.h create mode 100644 arch/x86/include/asm/visws/sgivw.h create mode 100644 arch/x86/include/asm/vm86.h create mode 100644 arch/x86/include/asm/vmi.h create mode 100644 arch/x86/include/asm/vmi_time.h create mode 100644 arch/x86/include/asm/voyager.h create mode 100644 arch/x86/include/asm/vsyscall.h create mode 100644 arch/x86/include/asm/xcr.h create mode 100644 arch/x86/include/asm/xen/events.h create mode 100644 arch/x86/include/asm/xen/grant_table.h create mode 100644 arch/x86/include/asm/xen/hypercall.h create mode 100644 arch/x86/include/asm/xen/hypervisor.h create mode 100644 arch/x86/include/asm/xen/interface.h create mode 100644 arch/x86/include/asm/xen/interface_32.h create mode 100644 arch/x86/include/asm/xen/interface_64.h create mode 100644 arch/x86/include/asm/xen/page.h create mode 100644 arch/x86/include/asm/xor.h create mode 100644 arch/x86/include/asm/xor_32.h create mode 100644 arch/x86/include/asm/xor_64.h create mode 100644 arch/x86/include/asm/xsave.h delete mode 100644 include/asm-x86/Kbuild delete mode 100644 include/asm-x86/a.out-core.h delete mode 100644 include/asm-x86/a.out.h delete mode 100644 include/asm-x86/acpi.h delete mode 100644 include/asm-x86/agp.h delete mode 100644 include/asm-x86/alternative-asm.h delete mode 100644 include/asm-x86/alternative.h delete mode 100644 include/asm-x86/amd_iommu.h delete mode 100644 include/asm-x86/amd_iommu_types.h delete mode 100644 include/asm-x86/apic.h delete mode 100644 include/asm-x86/apicdef.h delete mode 100644 include/asm-x86/arch_hooks.h delete mode 100644 include/asm-x86/asm.h delete mode 100644 include/asm-x86/atomic.h delete mode 100644 include/asm-x86/atomic_32.h delete mode 100644 include/asm-x86/atomic_64.h delete mode 100644 include/asm-x86/auxvec.h delete mode 100644 include/asm-x86/bigsmp/apic.h delete mode 100644 include/asm-x86/bigsmp/apicdef.h delete mode 100644 include/asm-x86/bigsmp/ipi.h delete mode 100644 include/asm-x86/bios_ebda.h delete mode 100644 include/asm-x86/bitops.h delete mode 100644 include/asm-x86/boot.h delete mode 100644 include/asm-x86/bootparam.h delete mode 100644 include/asm-x86/bug.h delete mode 100644 include/asm-x86/bugs.h delete mode 100644 include/asm-x86/byteorder.h delete mode 100644 include/asm-x86/cache.h delete mode 100644 include/asm-x86/cacheflush.h delete mode 100644 include/asm-x86/calgary.h delete mode 100644 include/asm-x86/calling.h delete mode 100644 include/asm-x86/checksum.h delete mode 100644 include/asm-x86/checksum_32.h delete mode 100644 include/asm-x86/checksum_64.h delete mode 100644 include/asm-x86/cmpxchg.h delete mode 100644 include/asm-x86/cmpxchg_32.h delete mode 100644 include/asm-x86/cmpxchg_64.h delete mode 100644 include/asm-x86/compat.h delete mode 100644 include/asm-x86/cpu.h delete mode 100644 include/asm-x86/cpufeature.h delete mode 100644 include/asm-x86/cputime.h delete mode 100644 include/asm-x86/current.h delete mode 100644 include/asm-x86/debugreg.h delete mode 100644 include/asm-x86/delay.h delete mode 100644 include/asm-x86/desc.h delete mode 100644 include/asm-x86/desc_defs.h delete mode 100644 include/asm-x86/device.h delete mode 100644 include/asm-x86/div64.h delete mode 100644 include/asm-x86/dma-mapping.h delete mode 100644 include/asm-x86/dma.h delete mode 100644 include/asm-x86/dmi.h delete mode 100644 include/asm-x86/ds.h delete mode 100644 include/asm-x86/dwarf2.h delete mode 100644 include/asm-x86/e820.h delete mode 100644 include/asm-x86/edac.h delete mode 100644 include/asm-x86/efi.h delete mode 100644 include/asm-x86/elf.h delete mode 100644 include/asm-x86/emergency-restart.h delete mode 100644 include/asm-x86/errno.h delete mode 100644 include/asm-x86/es7000/apic.h delete mode 100644 include/asm-x86/es7000/apicdef.h delete mode 100644 include/asm-x86/es7000/ipi.h delete mode 100644 include/asm-x86/es7000/mpparse.h delete mode 100644 include/asm-x86/es7000/wakecpu.h delete mode 100644 include/asm-x86/fb.h delete mode 100644 include/asm-x86/fcntl.h delete mode 100644 include/asm-x86/fixmap.h delete mode 100644 include/asm-x86/fixmap_32.h delete mode 100644 include/asm-x86/fixmap_64.h delete mode 100644 include/asm-x86/floppy.h delete mode 100644 include/asm-x86/frame.h delete mode 100644 include/asm-x86/ftrace.h delete mode 100644 include/asm-x86/futex.h delete mode 100644 include/asm-x86/gart.h delete mode 100644 include/asm-x86/genapic.h delete mode 100644 include/asm-x86/genapic_32.h delete mode 100644 include/asm-x86/genapic_64.h delete mode 100644 include/asm-x86/geode.h delete mode 100644 include/asm-x86/gpio.h delete mode 100644 include/asm-x86/hardirq.h delete mode 100644 include/asm-x86/hardirq_32.h delete mode 100644 include/asm-x86/hardirq_64.h delete mode 100644 include/asm-x86/highmem.h delete mode 100644 include/asm-x86/hpet.h delete mode 100644 include/asm-x86/hugetlb.h delete mode 100644 include/asm-x86/hw_irq.h delete mode 100644 include/asm-x86/hypertransport.h delete mode 100644 include/asm-x86/i387.h delete mode 100644 include/asm-x86/i8253.h delete mode 100644 include/asm-x86/i8259.h delete mode 100644 include/asm-x86/ia32.h delete mode 100644 include/asm-x86/ia32_unistd.h delete mode 100644 include/asm-x86/idle.h delete mode 100644 include/asm-x86/intel_arch_perfmon.h delete mode 100644 include/asm-x86/io.h delete mode 100644 include/asm-x86/io_32.h delete mode 100644 include/asm-x86/io_64.h delete mode 100644 include/asm-x86/io_apic.h delete mode 100644 include/asm-x86/ioctl.h delete mode 100644 include/asm-x86/ioctls.h delete mode 100644 include/asm-x86/iommu.h delete mode 100644 include/asm-x86/ipcbuf.h delete mode 100644 include/asm-x86/ipi.h delete mode 100644 include/asm-x86/irq.h delete mode 100644 include/asm-x86/irq_regs.h delete mode 100644 include/asm-x86/irq_regs_32.h delete mode 100644 include/asm-x86/irq_regs_64.h delete mode 100644 include/asm-x86/irq_remapping.h delete mode 100644 include/asm-x86/irq_vectors.h delete mode 100644 include/asm-x86/irqflags.h delete mode 100644 include/asm-x86/ist.h delete mode 100644 include/asm-x86/k8.h delete mode 100644 include/asm-x86/kdebug.h delete mode 100644 include/asm-x86/kexec.h delete mode 100644 include/asm-x86/kgdb.h delete mode 100644 include/asm-x86/kmap_types.h delete mode 100644 include/asm-x86/kprobes.h delete mode 100644 include/asm-x86/kvm.h delete mode 100644 include/asm-x86/kvm_host.h delete mode 100644 include/asm-x86/kvm_para.h delete mode 100644 include/asm-x86/kvm_x86_emulate.h delete mode 100644 include/asm-x86/ldt.h delete mode 100644 include/asm-x86/lguest.h delete mode 100644 include/asm-x86/lguest_hcall.h delete mode 100644 include/asm-x86/linkage.h delete mode 100644 include/asm-x86/local.h delete mode 100644 include/asm-x86/mach-default/apm.h delete mode 100644 include/asm-x86/mach-default/do_timer.h delete mode 100644 include/asm-x86/mach-default/entry_arch.h delete mode 100644 include/asm-x86/mach-default/mach_apic.h delete mode 100644 include/asm-x86/mach-default/mach_apicdef.h delete mode 100644 include/asm-x86/mach-default/mach_ipi.h delete mode 100644 include/asm-x86/mach-default/mach_mpparse.h delete mode 100644 include/asm-x86/mach-default/mach_mpspec.h delete mode 100644 include/asm-x86/mach-default/mach_timer.h delete mode 100644 include/asm-x86/mach-default/mach_traps.h delete mode 100644 include/asm-x86/mach-default/mach_wakecpu.h delete mode 100644 include/asm-x86/mach-default/pci-functions.h delete mode 100644 include/asm-x86/mach-default/setup_arch.h delete mode 100644 include/asm-x86/mach-default/smpboot_hooks.h delete mode 100644 include/asm-x86/mach-generic/gpio.h delete mode 100644 include/asm-x86/mach-generic/mach_apic.h delete mode 100644 include/asm-x86/mach-generic/mach_apicdef.h delete mode 100644 include/asm-x86/mach-generic/mach_ipi.h delete mode 100644 include/asm-x86/mach-generic/mach_mpparse.h delete mode 100644 include/asm-x86/mach-generic/mach_mpspec.h delete mode 100644 include/asm-x86/mach-rdc321x/gpio.h delete mode 100644 include/asm-x86/mach-rdc321x/rdc321x_defs.h delete mode 100644 include/asm-x86/mach-voyager/do_timer.h delete mode 100644 include/asm-x86/mach-voyager/entry_arch.h delete mode 100644 include/asm-x86/mach-voyager/setup_arch.h delete mode 100644 include/asm-x86/math_emu.h delete mode 100644 include/asm-x86/mc146818rtc.h delete mode 100644 include/asm-x86/mca.h delete mode 100644 include/asm-x86/mca_dma.h delete mode 100644 include/asm-x86/mce.h delete mode 100644 include/asm-x86/microcode.h delete mode 100644 include/asm-x86/mman.h delete mode 100644 include/asm-x86/mmconfig.h delete mode 100644 include/asm-x86/mmu.h delete mode 100644 include/asm-x86/mmu_context.h delete mode 100644 include/asm-x86/mmu_context_32.h delete mode 100644 include/asm-x86/mmu_context_64.h delete mode 100644 include/asm-x86/mmx.h delete mode 100644 include/asm-x86/mmzone.h delete mode 100644 include/asm-x86/mmzone_32.h delete mode 100644 include/asm-x86/mmzone_64.h delete mode 100644 include/asm-x86/module.h delete mode 100644 include/asm-x86/mpspec.h delete mode 100644 include/asm-x86/mpspec_def.h delete mode 100644 include/asm-x86/msgbuf.h delete mode 100644 include/asm-x86/msidef.h delete mode 100644 include/asm-x86/msr-index.h delete mode 100644 include/asm-x86/msr.h delete mode 100644 include/asm-x86/mtrr.h delete mode 100644 include/asm-x86/mutex.h delete mode 100644 include/asm-x86/mutex_32.h delete mode 100644 include/asm-x86/mutex_64.h delete mode 100644 include/asm-x86/nmi.h delete mode 100644 include/asm-x86/nops.h delete mode 100644 include/asm-x86/numa.h delete mode 100644 include/asm-x86/numa_32.h delete mode 100644 include/asm-x86/numa_64.h delete mode 100644 include/asm-x86/numaq.h delete mode 100644 include/asm-x86/numaq/apic.h delete mode 100644 include/asm-x86/numaq/apicdef.h delete mode 100644 include/asm-x86/numaq/ipi.h delete mode 100644 include/asm-x86/numaq/mpparse.h delete mode 100644 include/asm-x86/numaq/wakecpu.h delete mode 100644 include/asm-x86/olpc.h delete mode 100644 include/asm-x86/page.h delete mode 100644 include/asm-x86/page_32.h delete mode 100644 include/asm-x86/page_64.h delete mode 100644 include/asm-x86/param.h delete mode 100644 include/asm-x86/paravirt.h delete mode 100644 include/asm-x86/parport.h delete mode 100644 include/asm-x86/pat.h delete mode 100644 include/asm-x86/pci-direct.h delete mode 100644 include/asm-x86/pci.h delete mode 100644 include/asm-x86/pci_32.h delete mode 100644 include/asm-x86/pci_64.h delete mode 100644 include/asm-x86/pda.h delete mode 100644 include/asm-x86/percpu.h delete mode 100644 include/asm-x86/pgalloc.h delete mode 100644 include/asm-x86/pgtable-2level-defs.h delete mode 100644 include/asm-x86/pgtable-2level.h delete mode 100644 include/asm-x86/pgtable-3level-defs.h delete mode 100644 include/asm-x86/pgtable-3level.h delete mode 100644 include/asm-x86/pgtable.h delete mode 100644 include/asm-x86/pgtable_32.h delete mode 100644 include/asm-x86/pgtable_64.h delete mode 100644 include/asm-x86/poll.h delete mode 100644 include/asm-x86/posix_types.h delete mode 100644 include/asm-x86/posix_types_32.h delete mode 100644 include/asm-x86/posix_types_64.h delete mode 100644 include/asm-x86/prctl.h delete mode 100644 include/asm-x86/processor-cyrix.h delete mode 100644 include/asm-x86/processor-flags.h delete mode 100644 include/asm-x86/processor.h delete mode 100644 include/asm-x86/proto.h delete mode 100644 include/asm-x86/ptrace-abi.h delete mode 100644 include/asm-x86/ptrace.h delete mode 100644 include/asm-x86/pvclock-abi.h delete mode 100644 include/asm-x86/pvclock.h delete mode 100644 include/asm-x86/reboot.h delete mode 100644 include/asm-x86/reboot_fixups.h delete mode 100644 include/asm-x86/required-features.h delete mode 100644 include/asm-x86/resource.h delete mode 100644 include/asm-x86/resume-trace.h delete mode 100644 include/asm-x86/rio.h delete mode 100644 include/asm-x86/rtc.h delete mode 100644 include/asm-x86/rwlock.h delete mode 100644 include/asm-x86/rwsem.h delete mode 100644 include/asm-x86/scatterlist.h delete mode 100644 include/asm-x86/seccomp.h delete mode 100644 include/asm-x86/seccomp_32.h delete mode 100644 include/asm-x86/seccomp_64.h delete mode 100644 include/asm-x86/sections.h delete mode 100644 include/asm-x86/segment.h delete mode 100644 include/asm-x86/sembuf.h delete mode 100644 include/asm-x86/serial.h delete mode 100644 include/asm-x86/setup.h delete mode 100644 include/asm-x86/shmbuf.h delete mode 100644 include/asm-x86/shmparam.h delete mode 100644 include/asm-x86/sigcontext.h delete mode 100644 include/asm-x86/sigcontext32.h delete mode 100644 include/asm-x86/siginfo.h delete mode 100644 include/asm-x86/signal.h delete mode 100644 include/asm-x86/smp.h delete mode 100644 include/asm-x86/socket.h delete mode 100644 include/asm-x86/sockios.h delete mode 100644 include/asm-x86/sparsemem.h delete mode 100644 include/asm-x86/spinlock.h delete mode 100644 include/asm-x86/spinlock_types.h delete mode 100644 include/asm-x86/srat.h delete mode 100644 include/asm-x86/stacktrace.h delete mode 100644 include/asm-x86/stat.h delete mode 100644 include/asm-x86/statfs.h delete mode 100644 include/asm-x86/string.h delete mode 100644 include/asm-x86/string_32.h delete mode 100644 include/asm-x86/string_64.h delete mode 100644 include/asm-x86/summit/apic.h delete mode 100644 include/asm-x86/summit/apicdef.h delete mode 100644 include/asm-x86/summit/ipi.h delete mode 100644 include/asm-x86/summit/mpparse.h delete mode 100644 include/asm-x86/suspend.h delete mode 100644 include/asm-x86/suspend_32.h delete mode 100644 include/asm-x86/suspend_64.h delete mode 100644 include/asm-x86/swiotlb.h delete mode 100644 include/asm-x86/sync_bitops.h delete mode 100644 include/asm-x86/syscall.h delete mode 100644 include/asm-x86/syscalls.h delete mode 100644 include/asm-x86/system.h delete mode 100644 include/asm-x86/system_64.h delete mode 100644 include/asm-x86/tce.h delete mode 100644 include/asm-x86/termbits.h delete mode 100644 include/asm-x86/termios.h delete mode 100644 include/asm-x86/therm_throt.h delete mode 100644 include/asm-x86/thread_info.h delete mode 100644 include/asm-x86/time.h delete mode 100644 include/asm-x86/timer.h delete mode 100644 include/asm-x86/timex.h delete mode 100644 include/asm-x86/tlb.h delete mode 100644 include/asm-x86/tlbflush.h delete mode 100644 include/asm-x86/topology.h delete mode 100644 include/asm-x86/trampoline.h delete mode 100644 include/asm-x86/traps.h delete mode 100644 include/asm-x86/tsc.h delete mode 100644 include/asm-x86/types.h delete mode 100644 include/asm-x86/uaccess.h delete mode 100644 include/asm-x86/uaccess_32.h delete mode 100644 include/asm-x86/uaccess_64.h delete mode 100644 include/asm-x86/ucontext.h delete mode 100644 include/asm-x86/unaligned.h delete mode 100644 include/asm-x86/unistd.h delete mode 100644 include/asm-x86/unistd_32.h delete mode 100644 include/asm-x86/unistd_64.h delete mode 100644 include/asm-x86/unwind.h delete mode 100644 include/asm-x86/user.h delete mode 100644 include/asm-x86/user32.h delete mode 100644 include/asm-x86/user_32.h delete mode 100644 include/asm-x86/user_64.h delete mode 100644 include/asm-x86/uv/bios.h delete mode 100644 include/asm-x86/uv/uv_bau.h delete mode 100644 include/asm-x86/uv/uv_hub.h delete mode 100644 include/asm-x86/uv/uv_irq.h delete mode 100644 include/asm-x86/uv/uv_mmrs.h delete mode 100644 include/asm-x86/vdso.h delete mode 100644 include/asm-x86/vga.h delete mode 100644 include/asm-x86/vgtod.h delete mode 100644 include/asm-x86/vic.h delete mode 100644 include/asm-x86/visws/cobalt.h delete mode 100644 include/asm-x86/visws/lithium.h delete mode 100644 include/asm-x86/visws/piix4.h delete mode 100644 include/asm-x86/visws/sgivw.h delete mode 100644 include/asm-x86/vm86.h delete mode 100644 include/asm-x86/vmi.h delete mode 100644 include/asm-x86/vmi_time.h delete mode 100644 include/asm-x86/voyager.h delete mode 100644 include/asm-x86/vsyscall.h delete mode 100644 include/asm-x86/xcr.h delete mode 100644 include/asm-x86/xen/events.h delete mode 100644 include/asm-x86/xen/grant_table.h delete mode 100644 include/asm-x86/xen/hypercall.h delete mode 100644 include/asm-x86/xen/hypervisor.h delete mode 100644 include/asm-x86/xen/interface.h delete mode 100644 include/asm-x86/xen/interface_32.h delete mode 100644 include/asm-x86/xen/interface_64.h delete mode 100644 include/asm-x86/xen/page.h delete mode 100644 include/asm-x86/xor.h delete mode 100644 include/asm-x86/xor_32.h delete mode 100644 include/asm-x86/xor_64.h delete mode 100644 include/asm-x86/xsave.h (limited to 'arch/x86') diff --git a/arch/ia64/ia32/audit.c b/arch/ia64/ia32/audit.c index 5e901c75df1..5c93ddd1e42 100644 --- a/arch/ia64/ia32/audit.c +++ b/arch/ia64/ia32/audit.c @@ -1,4 +1,4 @@ -#include +#include "../../x86/include/asm/unistd_32.h" unsigned ia32_dir_class[] = { #include diff --git a/arch/um/Makefile b/arch/um/Makefile index 94fb24ea5af..4fc79f3030e 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -156,9 +156,9 @@ $(ARCH_DIR)/include/asm/arch: @echo ' SYMLINK $@' ifneq ($(KBUILD_SRC),) $(Q)mkdir -p $(objtree)/$(ARCH_DIR)/include/asm - $(Q)ln -fsn $(srctree)/include/asm-$(HEADER_ARCH) $@ + $(Q)ln -fsn $(srctree)/arch/$(HEADER_ARCH)/include/asm $@ else - $(Q)ln -fsn ../../../../include/asm-$(HEADER_ARCH) $@ + $(Q)ln -fsn ../../../$(HEADER_ARCH)/include/asm $@ endif $(objtree)/$(ARCH_DIR)/include/shared: diff --git a/arch/um/sys-x86_64/syscall_table.c b/arch/um/sys-x86_64/syscall_table.c index 32f5fbe2d0d..ef42ec01eaf 100644 --- a/arch/um/sys-x86_64/syscall_table.c +++ b/arch/um/sys-x86_64/syscall_table.c @@ -42,7 +42,7 @@ #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; #undef ASM_X86__UNISTD_64_H -#include +#include "../../x86/include/asm/unistd_64.h" #undef __SYSCALL #define __SYSCALL(nr, sym) [ nr ] = sym, @@ -64,7 +64,7 @@ extern void sys_ni_syscall(void); */ sys_call_ptr_t sys_call_table[] __cacheline_aligned = { -#include +#include "../../x86/include/asm/unistd_64.h" }; int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f5631da585b..d1a47adb5ae 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -110,16 +110,16 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) mcore-y := arch/x86/mach-default/ # Voyager subarch support -mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager +mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ # generic subarchitecture -mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic +mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ # default subarch .h files -mflags-y += -Iinclude/asm-x86/mach-default +mflags-y += -Iarch/x86/include/asm/mach-default # 64 bit does not support subarch support - clear sub arch variables fcore-$(CONFIG_X86_64) := diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild new file mode 100644 index 00000000000..4a8e80cdcfa --- /dev/null +++ b/arch/x86/include/asm/Kbuild @@ -0,0 +1,24 @@ +include include/asm-generic/Kbuild.asm + +header-y += boot.h +header-y += bootparam.h +header-y += debugreg.h +header-y += ldt.h +header-y += msr-index.h +header-y += prctl.h +header-y += ptrace-abi.h +header-y += sigcontext32.h +header-y += ucontext.h +header-y += processor-flags.h + +unifdef-y += e820.h +unifdef-y += ist.h +unifdef-y += mce.h +unifdef-y += msr.h +unifdef-y += mtrr.h +unifdef-y += posix_types_32.h +unifdef-y += posix_types_64.h +unifdef-y += unistd_32.h +unifdef-y += unistd_64.h +unifdef-y += vm86.h +unifdef-y += vsyscall.h diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h new file mode 100644 index 00000000000..f5705761a37 --- /dev/null +++ b/arch/x86/include/asm/a.out-core.h @@ -0,0 +1,73 @@ +/* a.out coredump register dumper + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef ASM_X86__A_OUT_CORE_H +#define ASM_X86__A_OUT_CORE_H + +#ifdef __KERNEL__ +#ifdef CONFIG_X86_32 + +#include +#include + +/* + * fill in the user structure for an a.out core dump + */ +static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) +{ + u16 gs; + +/* changed the size calculations - should hopefully work better. lbt */ + dump->magic = CMAGIC; + dump->start_code = 0; + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); + dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1))) + >> PAGE_SHIFT; + dump->u_dsize -= dump->u_tsize; + dump->u_ssize = 0; + dump->u_debugreg[0] = current->thread.debugreg0; + dump->u_debugreg[1] = current->thread.debugreg1; + dump->u_debugreg[2] = current->thread.debugreg2; + dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.debugreg7; + + if (dump->start_stack < TASK_SIZE) + dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) + >> PAGE_SHIFT; + + dump->regs.bx = regs->bx; + dump->regs.cx = regs->cx; + dump->regs.dx = regs->dx; + dump->regs.si = regs->si; + dump->regs.di = regs->di; + dump->regs.bp = regs->bp; + dump->regs.ax = regs->ax; + dump->regs.ds = (u16)regs->ds; + dump->regs.es = (u16)regs->es; + dump->regs.fs = (u16)regs->fs; + savesegment(gs, gs); + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; + dump->regs.cs = (u16)regs->cs; + dump->regs.flags = regs->flags; + dump->regs.sp = regs->sp; + dump->regs.ss = (u16)regs->ss; + + dump->u_fpvalid = dump_fpu(regs, &dump->i387); +} + +#endif /* CONFIG_X86_32 */ +#endif /* __KERNEL__ */ +#endif /* ASM_X86__A_OUT_CORE_H */ diff --git a/arch/x86/include/asm/a.out.h b/arch/x86/include/asm/a.out.h new file mode 100644 index 00000000000..0948748bc69 --- /dev/null +++ b/arch/x86/include/asm/a.out.h @@ -0,0 +1,20 @@ +#ifndef ASM_X86__A_OUT_H +#define ASM_X86__A_OUT_H + +struct exec +{ + unsigned int a_info; /* Use macros N_MAGIC, etc for access */ + unsigned a_text; /* length of text, in bytes */ + unsigned a_data; /* length of data, in bytes */ + unsigned a_bss; /* length of uninitialized data area for file, in bytes */ + unsigned a_syms; /* length of symbol table data in file, in bytes */ + unsigned a_entry; /* start address */ + unsigned a_trsize; /* length of relocation info for text, in bytes */ + unsigned a_drsize; /* length of relocation info for data, in bytes */ +}; + +#define N_TRSIZE(a) ((a).a_trsize) +#define N_DRSIZE(a) ((a).a_drsize) +#define N_SYMSIZE(a) ((a).a_syms) + +#endif /* ASM_X86__A_OUT_H */ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h new file mode 100644 index 00000000000..392e17336be --- /dev/null +++ b/arch/x86/include/asm/acpi.h @@ -0,0 +1,178 @@ +#ifndef ASM_X86__ACPI_H +#define ASM_X86__ACPI_H + +/* + * Copyright (C) 2001 Paul Diefenbaugh + * Copyright (C) 2001 Patrick Mochel + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +#include + +#include +#include +#include +#include + +#define COMPILER_DEPENDENT_INT64 long long +#define COMPILER_DEPENDENT_UINT64 unsigned long long + +/* + * Calling conventions: + * + * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) + * ACPI_EXTERNAL_XFACE - External ACPI interfaces + * ACPI_INTERNAL_XFACE - Internal ACPI interfaces + * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces + */ +#define ACPI_SYSTEM_XFACE +#define ACPI_EXTERNAL_XFACE +#define ACPI_INTERNAL_XFACE +#define ACPI_INTERNAL_VAR_XFACE + +/* Asm macros */ + +#define ACPI_ASM_MACROS +#define BREAKPOINT3 +#define ACPI_DISABLE_IRQS() local_irq_disable() +#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_FLUSH_CPU_CACHE() wbinvd() + +int __acpi_acquire_global_lock(unsigned int *lock); +int __acpi_release_global_lock(unsigned int *lock); + +#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = __acpi_acquire_global_lock(&facs->global_lock)) + +#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = __acpi_release_global_lock(&facs->global_lock)) + +/* + * Math helper asm macros + */ +#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \ + asm("divl %2;" \ + : "=a"(q32), "=d"(r32) \ + : "r"(d32), \ + "0"(n_lo), "1"(n_hi)) + + +#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \ + asm("shrl $1,%2 ;" \ + "rcrl $1,%3;" \ + : "=r"(n_hi), "=r"(n_lo) \ + : "0"(n_hi), "1"(n_lo)) + +#ifdef CONFIG_ACPI +extern int acpi_lapic; +extern int acpi_ioapic; +extern int acpi_noirq; +extern int acpi_strict; +extern int acpi_disabled; +extern int acpi_ht; +extern int acpi_pci_disabled; +extern int acpi_skip_timer_override; +extern int acpi_use_timer_override; + +extern u8 acpi_sci_flags; +extern int acpi_sci_override_gsi; +void acpi_pic_sci_set_trigger(unsigned int, u16); + +static inline void disable_acpi(void) +{ + acpi_disabled = 1; + acpi_ht = 0; + acpi_pci_disabled = 1; + acpi_noirq = 1; +} + +/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */ +#define FIX_ACPI_PAGES 4 + +extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); + +static inline void acpi_noirq_set(void) { acpi_noirq = 1; } +static inline void acpi_disable_pci(void) +{ + acpi_pci_disabled = 1; + acpi_noirq_set(); +} +extern int acpi_irq_balance_set(char *str); + +/* routines for saving/restoring kernel state */ +extern int acpi_save_state_mem(void); +extern void acpi_restore_state_mem(void); + +extern unsigned long acpi_wakeup_address; + +/* early initialization routine */ +extern void acpi_reserve_bootmem(void); + +/* + * Check if the CPU can handle C2 and deeper + */ +static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) +{ + /* + * Early models (<=5) of AMD Opterons are not supposed to go into + * C2 state. + * + * Steppings 0x0A and later are good + */ + if (boot_cpu_data.x86 == 0x0F && + boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86_model <= 0x05 && + boot_cpu_data.x86_mask < 0x0A) + return 1; + else if (boot_cpu_has(X86_FEATURE_AMDC1E)) + return 1; + else + return max_cstate; +} + +#else /* !CONFIG_ACPI */ + +#define acpi_lapic 0 +#define acpi_ioapic 0 +static inline void acpi_noirq_set(void) { } +static inline void acpi_disable_pci(void) { } +static inline void disable_acpi(void) { } + +#endif /* !CONFIG_ACPI */ + +#define ARCH_HAS_POWER_INIT 1 + +struct bootnode; + +#ifdef CONFIG_ACPI_NUMA +extern int acpi_numa; +extern int acpi_scan_nodes(unsigned long start, unsigned long end); +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) +extern void acpi_fake_nodes(const struct bootnode *fake_nodes, + int num_nodes); +#else +static inline void acpi_fake_nodes(const struct bootnode *fake_nodes, + int num_nodes) +{ +} +#endif + +#define acpi_unlazy_tlb(x) leave_mm(x) + +#endif /* ASM_X86__ACPI_H */ diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h new file mode 100644 index 00000000000..3617fd4fcdf --- /dev/null +++ b/arch/x86/include/asm/agp.h @@ -0,0 +1,35 @@ +#ifndef ASM_X86__AGP_H +#define ASM_X86__AGP_H + +#include +#include + +/* + * Functions to keep the agpgart mappings coherent with the MMU. The + * GART gives the CPU a physical alias of pages in memory. The alias + * region is mapped uncacheable. Make sure there are no conflicting + * mappings with different cachability attributes for the same + * page. This avoids data corruption on some CPUs. + */ + +#define map_page_into_agp(page) set_pages_uc(page, 1) +#define unmap_page_from_agp(page) set_pages_wb(page, 1) + +/* + * Could use CLFLUSH here if the cpu supports it. But then it would + * need to be called for each cacheline of the whole page so it may + * not be worth it. Would need a page for it. + */ +#define flush_agp_cache() wbinvd() + +/* Convert a physical address to an address suitable for the GART. */ +#define phys_to_gart(x) (x) +#define gart_to_phys(x) (x) + +/* GATT allocation. Returns/accepts GATT kernel virtual address. */ +#define alloc_gatt_pages(order) \ + ((char *)__get_free_pages(GFP_KERNEL, (order))) +#define free_gatt_pages(table, order) \ + free_pages((unsigned long)(table), (order)) + +#endif /* ASM_X86__AGP_H */ diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h new file mode 100644 index 00000000000..e2077d343c3 --- /dev/null +++ b/arch/x86/include/asm/alternative-asm.h @@ -0,0 +1,22 @@ +#ifdef __ASSEMBLY__ + +#ifdef CONFIG_X86_32 +# define X86_ALIGN .long +#else +# define X86_ALIGN .quad +#endif + +#ifdef CONFIG_SMP + .macro LOCK_PREFIX +1: lock + .section .smp_locks,"a" + .align 4 + X86_ALIGN 1b + .previous + .endm +#else + .macro LOCK_PREFIX + .endm +#endif + +#endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h new file mode 100644 index 00000000000..22d3c9862bf --- /dev/null +++ b/arch/x86/include/asm/alternative.h @@ -0,0 +1,183 @@ +#ifndef ASM_X86__ALTERNATIVE_H +#define ASM_X86__ALTERNATIVE_H + +#include +#include +#include + +/* + * Alternative inline assembly for SMP. + * + * The LOCK_PREFIX macro defined here replaces the LOCK and + * LOCK_PREFIX macros used everywhere in the source tree. + * + * SMP alternatives use the same data structures as the other + * alternatives and the X86_FEATURE_UP flag to indicate the case of a + * UP system running a SMP kernel. The existing apply_alternatives() + * works fine for patching a SMP kernel for UP. + * + * The SMP alternative tables can be kept after boot and contain both + * UP and SMP versions of the instructions to allow switching back to + * SMP at runtime, when hotplugging in a new CPU, which is especially + * useful in virtualized environments. + * + * The very common lock prefix is handled as special case in a + * separate table which is a pure address list without replacement ptr + * and size information. That keeps the table sizes small. + */ + +#ifdef CONFIG_SMP +#define LOCK_PREFIX \ + ".section .smp_locks,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661f\n" /* address */ \ + ".previous\n" \ + "661:\n\tlock; " + +#else /* ! CONFIG_SMP */ +#define LOCK_PREFIX "" +#endif + +/* This must be included *after* the definition of LOCK_PREFIX */ +#include + +struct alt_instr { + u8 *instr; /* original instruction */ + u8 *replacement; + u8 cpuid; /* cpuid bit set for replacement */ + u8 instrlen; /* length of original instruction */ + u8 replacementlen; /* length of new instruction, <= instrlen */ + u8 pad1; +#ifdef CONFIG_X86_64 + u32 pad2; +#endif +}; + +extern void alternative_instructions(void); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); + +struct module; + +#ifdef CONFIG_SMP +extern void alternatives_smp_module_add(struct module *mod, char *name, + void *locks, void *locks_end, + void *text, void *text_end); +extern void alternatives_smp_module_del(struct module *mod); +extern void alternatives_smp_switch(int smp); +#else +static inline void alternatives_smp_module_add(struct module *mod, char *name, + void *locks, void *locks_end, + void *text, void *text_end) {} +static inline void alternatives_smp_module_del(struct module *mod) {} +static inline void alternatives_smp_switch(int smp) {} +#endif /* CONFIG_SMP */ + +const unsigned char *const *find_nop_table(void); + +/* + * Alternative instructions for different CPU types or capabilities. + * + * This allows to use optimized instructions even on generic binary + * kernels. + * + * length of oldinstr must be longer or equal the length of newinstr + * It can be padded with nops as needed. + * + * For non barrier like inlines please define new variants + * without volatile and memory clobber. + */ +#define alternative(oldinstr, newinstr, feature) \ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte %c0\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" :: "i" (feature) : "memory") + +/* + * Alternative inline assembly with input. + * + * Pecularities: + * No memory clobber here. + * Argument numbers start with 1. + * Best is to use constraints that are fixed size (like (%1) ... "r") + * If you use variable sized constraints like "m" or "g" in the + * replacement make sure to pad to the worst case length. + */ +#define alternative_input(oldinstr, newinstr, feature, input...) \ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte %c0\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" :: "i" (feature), ##input) + +/* Like alternative_input, but with a single output argument */ +#define alternative_io(oldinstr, newinstr, feature, output, input...) \ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte %c[feat]\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" : output : [feat] "i" (feature), ##input) + +/* + * use this macro(s) if you need more than one output parameter + * in alternative_io + */ +#define ASM_OUTPUT2(a, b) a, b + +struct paravirt_patch_site; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end); +#else +static inline void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{} +#define __parainstructions NULL +#define __parainstructions_end NULL +#endif + +extern void add_nops(void *insns, unsigned int len); + +/* + * Clear and restore the kernel write-protection flag on the local CPU. + * Allows the kernel to edit read-only pages. + * Side-effect: any interrupt handler running between save and restore will have + * the ability to write to read-only pages. + * + * Warning: + * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and + * no thread can be preempted in the instructions being modified (no iret to an + * invalid instruction possible) or if the instructions are changed from a + * consistent state to another consistent state atomically. + * More care must be taken when modifying code in the SMP case because of + * Intel's errata. + * On the local CPU you need to be protected again NMI or MCE handlers seeing an + * inconsistent instruction while you patch. + * The _early version expects the memory to already be RW. + */ + +extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_early(void *addr, const void *opcode, size_t len); + +#endif /* ASM_X86__ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h new file mode 100644 index 00000000000..041d0db7da2 --- /dev/null +++ b/arch/x86/include/asm/amd_iommu.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * Leo Duran + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef ASM_X86__AMD_IOMMU_H +#define ASM_X86__AMD_IOMMU_H + +#include + +#ifdef CONFIG_AMD_IOMMU +extern int amd_iommu_init(void); +extern int amd_iommu_init_dma_ops(void); +extern void amd_iommu_detect(void); +extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +#else +static inline int amd_iommu_init(void) { return -ENODEV; } +static inline void amd_iommu_detect(void) { } +#endif + +#endif /* ASM_X86__AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h new file mode 100644 index 00000000000..b3085869a17 --- /dev/null +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -0,0 +1,404 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * Leo Duran + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef ASM_X86__AMD_IOMMU_TYPES_H +#define ASM_X86__AMD_IOMMU_TYPES_H + +#include +#include +#include + +/* + * some size calculation constants + */ +#define DEV_TABLE_ENTRY_SIZE 32 +#define ALIAS_TABLE_ENTRY_SIZE 2 +#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *)) + +/* Length of the MMIO region for the AMD IOMMU */ +#define MMIO_REGION_LENGTH 0x4000 + +/* Capability offsets used by the driver */ +#define MMIO_CAP_HDR_OFFSET 0x00 +#define MMIO_RANGE_OFFSET 0x0c +#define MMIO_MISC_OFFSET 0x10 + +/* Masks, shifts and macros to parse the device range capability */ +#define MMIO_RANGE_LD_MASK 0xff000000 +#define MMIO_RANGE_FD_MASK 0x00ff0000 +#define MMIO_RANGE_BUS_MASK 0x0000ff00 +#define MMIO_RANGE_LD_SHIFT 24 +#define MMIO_RANGE_FD_SHIFT 16 +#define MMIO_RANGE_BUS_SHIFT 8 +#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT) +#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT) +#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT) +#define MMIO_MSI_NUM(x) ((x) & 0x1f) + +/* Flag masks for the AMD IOMMU exclusion range */ +#define MMIO_EXCL_ENABLE_MASK 0x01ULL +#define MMIO_EXCL_ALLOW_MASK 0x02ULL + +/* Used offsets into the MMIO space */ +#define MMIO_DEV_TABLE_OFFSET 0x0000 +#define MMIO_CMD_BUF_OFFSET 0x0008 +#define MMIO_EVT_BUF_OFFSET 0x0010 +#define MMIO_CONTROL_OFFSET 0x0018 +#define MMIO_EXCL_BASE_OFFSET 0x0020 +#define MMIO_EXCL_LIMIT_OFFSET 0x0028 +#define MMIO_CMD_HEAD_OFFSET 0x2000 +#define MMIO_CMD_TAIL_OFFSET 0x2008 +#define MMIO_EVT_HEAD_OFFSET 0x2010 +#define MMIO_EVT_TAIL_OFFSET 0x2018 +#define MMIO_STATUS_OFFSET 0x2020 + +/* MMIO status bits */ +#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 + +/* event logging constants */ +#define EVENT_ENTRY_SIZE 0x10 +#define EVENT_TYPE_SHIFT 28 +#define EVENT_TYPE_MASK 0xf +#define EVENT_TYPE_ILL_DEV 0x1 +#define EVENT_TYPE_IO_FAULT 0x2 +#define EVENT_TYPE_DEV_TAB_ERR 0x3 +#define EVENT_TYPE_PAGE_TAB_ERR 0x4 +#define EVENT_TYPE_ILL_CMD 0x5 +#define EVENT_TYPE_CMD_HARD_ERR 0x6 +#define EVENT_TYPE_IOTLB_INV_TO 0x7 +#define EVENT_TYPE_INV_DEV_REQ 0x8 +#define EVENT_DEVID_MASK 0xffff +#define EVENT_DEVID_SHIFT 0 +#define EVENT_DOMID_MASK 0xffff +#define EVENT_DOMID_SHIFT 0 +#define EVENT_FLAGS_MASK 0xfff +#define EVENT_FLAGS_SHIFT 0x10 + +/* feature control bits */ +#define CONTROL_IOMMU_EN 0x00ULL +#define CONTROL_HT_TUN_EN 0x01ULL +#define CONTROL_EVT_LOG_EN 0x02ULL +#define CONTROL_EVT_INT_EN 0x03ULL +#define CONTROL_COMWAIT_EN 0x04ULL +#define CONTROL_PASSPW_EN 0x08ULL +#define CONTROL_RESPASSPW_EN 0x09ULL +#define CONTROL_COHERENT_EN 0x0aULL +#define CONTROL_ISOC_EN 0x0bULL +#define CONTROL_CMDBUF_EN 0x0cULL +#define CONTROL_PPFLOG_EN 0x0dULL +#define CONTROL_PPFINT_EN 0x0eULL + +/* command specific defines */ +#define CMD_COMPL_WAIT 0x01 +#define CMD_INV_DEV_ENTRY 0x02 +#define CMD_INV_IOMMU_PAGES 0x03 + +#define CMD_COMPL_WAIT_STORE_MASK 0x01 +#define CMD_COMPL_WAIT_INT_MASK 0x02 +#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01 +#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02 + +#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL + +/* macros and definitions for device table entries */ +#define DEV_ENTRY_VALID 0x00 +#define DEV_ENTRY_TRANSLATION 0x01 +#define DEV_ENTRY_IR 0x3d +#define DEV_ENTRY_IW 0x3e +#define DEV_ENTRY_NO_PAGE_FAULT 0x62 +#define DEV_ENTRY_EX 0x67 +#define DEV_ENTRY_SYSMGT1 0x68 +#define DEV_ENTRY_SYSMGT2 0x69 +#define DEV_ENTRY_INIT_PASS 0xb8 +#define DEV_ENTRY_EINT_PASS 0xb9 +#define DEV_ENTRY_NMI_PASS 0xba +#define DEV_ENTRY_LINT0_PASS 0xbe +#define DEV_ENTRY_LINT1_PASS 0xbf +#define DEV_ENTRY_MODE_MASK 0x07 +#define DEV_ENTRY_MODE_SHIFT 0x09 + +/* constants to configure the command buffer */ +#define CMD_BUFFER_SIZE 8192 +#define CMD_BUFFER_ENTRIES 512 +#define MMIO_CMD_SIZE_SHIFT 56 +#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) + +/* constants for event buffer handling */ +#define EVT_BUFFER_SIZE 8192 /* 512 entries */ +#define EVT_LEN_MASK (0x9ULL << 56) + +#define PAGE_MODE_1_LEVEL 0x01 +#define PAGE_MODE_2_LEVEL 0x02 +#define PAGE_MODE_3_LEVEL 0x03 + +#define IOMMU_PDE_NL_0 0x000ULL +#define IOMMU_PDE_NL_1 0x200ULL +#define IOMMU_PDE_NL_2 0x400ULL +#define IOMMU_PDE_NL_3 0x600ULL + +#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL) +#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL) +#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL) + +#define IOMMU_MAP_SIZE_L1 (1ULL << 21) +#define IOMMU_MAP_SIZE_L2 (1ULL << 30) +#define IOMMU_MAP_SIZE_L3 (1ULL << 39) + +#define IOMMU_PTE_P (1ULL << 0) +#define IOMMU_PTE_TV (1ULL << 1) +#define IOMMU_PTE_U (1ULL << 59) +#define IOMMU_PTE_FC (1ULL << 60) +#define IOMMU_PTE_IR (1ULL << 61) +#define IOMMU_PTE_IW (1ULL << 62) + +#define IOMMU_L1_PDE(address) \ + ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) +#define IOMMU_L2_PDE(address) \ + ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) + +#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) +#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) +#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) +#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) + +#define IOMMU_PROT_MASK 0x03 +#define IOMMU_PROT_IR 0x01 +#define IOMMU_PROT_IW 0x02 + +/* IOMMU capabilities */ +#define IOMMU_CAP_IOTLB 24 +#define IOMMU_CAP_NPCACHE 26 + +#define MAX_DOMAIN_ID 65536 + +/* FIXME: move this macro to */ +#define PCI_BUS(x) (((x) >> 8) & 0xff) + +/* + * This structure contains generic data for IOMMU protection domains + * independent of their use. + */ +struct protection_domain { + spinlock_t lock; /* mostly used to lock the page table*/ + u16 id; /* the domain id written to the device table */ + int mode; /* paging mode (0-6 levels) */ + u64 *pt_root; /* page table root pointer */ + void *priv; /* private data */ +}; + +/* + * Data container for a dma_ops specific protection domain + */ +struct dma_ops_domain { + struct list_head list; + + /* generic protection domain information */ + struct protection_domain domain; + + /* size of the aperture for the mappings */ + unsigned long aperture_size; + + /* address we start to search for free addresses */ + unsigned long next_bit; + + /* address allocation bitmap */ + unsigned long *bitmap; + + /* + * Array of PTE pages for the aperture. In this array we save all the + * leaf pages of the domain page table used for the aperture. This way + * we don't need to walk the page table to find a specific PTE. We can + * just calculate its address in constant time. + */ + u64 **pte_pages; + + /* This will be set to true when TLB needs to be flushed */ + bool need_flush; + + /* + * if this is a preallocated domain, keep the device for which it was + * preallocated in this variable + */ + u16 target_dev; +}; + +/* + * Structure where we save information about one hardware AMD IOMMU in the + * system. + */ +struct amd_iommu { + struct list_head list; + + /* locks the accesses to the hardware */ + spinlock_t lock; + + /* Pointer to PCI device of this IOMMU */ + struct pci_dev *dev; + + /* + * Capability pointer. There could be more than one IOMMU per PCI + * device function if there are more than one AMD IOMMU capability + * pointers. + */ + u16 cap_ptr; + + /* physical address of MMIO space */ + u64 mmio_phys; + /* virtual address of MMIO space */ + u8 *mmio_base; + + /* capabilities of that IOMMU read from ACPI */ + u32 cap; + + /* pci domain of this IOMMU */ + u16 pci_seg; + + /* first device this IOMMU handles. read from PCI */ + u16 first_device; + /* last device this IOMMU handles. read from PCI */ + u16 last_device; + + /* start of exclusion range of that IOMMU */ + u64 exclusion_start; + /* length of exclusion range of that IOMMU */ + u64 exclusion_length; + + /* command buffer virtual address */ + u8 *cmd_buf; + /* size of command buffer */ + u32 cmd_buf_size; + + /* event buffer virtual address */ + u8 *evt_buf; + /* size of event buffer */ + u32 evt_buf_size; + /* MSI number for event interrupt */ + u16 evt_msi_num; + + /* if one, we need to send a completion wait command */ + int need_sync; + + /* true if interrupts for this IOMMU are already enabled */ + bool int_enabled; + + /* default dma_ops domain for that IOMMU */ + struct dma_ops_domain *default_dom; +}; + +/* + * List with all IOMMUs in the system. This list is not locked because it is + * only written and read at driver initialization or suspend time + */ +extern struct list_head amd_iommu_list; + +/* + * Structure defining one entry in the device table + */ +struct dev_table_entry { + u32 data[8]; +}; + +/* + * One entry for unity mappings parsed out of the ACPI table. + */ +struct unity_map_entry { + struct list_head list; + + /* starting device id this entry is used for (including) */ + u16 devid_start; + /* end device id this entry is used for (including) */ + u16 devid_end; + + /* start address to unity map (including) */ + u64 address_start; + /* end address to unity map (including) */ + u64 address_end; + + /* required protection */ + int prot; +}; + +/* + * List of all unity mappings. It is not locked because as runtime it is only + * read. It is created at ACPI table parsing time. + */ +extern struct list_head amd_iommu_unity_map; + +/* + * Data structures for device handling + */ + +/* + * Device table used by hardware. Read and write accesses by software are + * locked with the amd_iommu_pd_table lock. + */ +extern struct dev_table_entry *amd_iommu_dev_table; + +/* + * Alias table to find requestor ids to device ids. Not locked because only + * read on runtime. + */ +extern u16 *amd_iommu_alias_table; + +/* + * Reverse lookup table to find the IOMMU which translates a specific device. + */ +extern struct amd_iommu **amd_iommu_rlookup_table; + +/* size of the dma_ops aperture as power of 2 */ +extern unsigned amd_iommu_aperture_order; + +/* largest PCI device id we expect translation requests for */ +extern u16 amd_iommu_last_bdf; + +/* data structures for protection domain handling */ +extern struct protection_domain **amd_iommu_pd_table; + +/* allocation bitmap for domain ids */ +extern unsigned long *amd_iommu_pd_alloc_bitmap; + +/* will be 1 if device isolation is enabled */ +extern int amd_iommu_isolate; + +/* + * If true, the addresses will be flushed on unmap time, not when + * they are reused + */ +extern bool amd_iommu_unmap_flush; + +/* takes a PCI device id and prints it out in a readable form */ +static inline void print_devid(u16 devid, int nl) +{ + int bus = devid >> 8; + int dev = devid >> 3 & 0x1f; + int fn = devid & 0x07; + + printk("%02x:%02x.%x", bus, dev, fn); + if (nl) + printk("\n"); +} + +/* takes bus and device/function and returns the device id + * FIXME: should that be in generic PCI code? */ +static inline u16 calc_devid(u8 bus, u8 devfn) +{ + return (((u16)bus) << 8) | devfn; +} + +#endif /* ASM_X86__AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h new file mode 100644 index 00000000000..ef1d72dbdfe --- /dev/null +++ b/arch/x86/include/asm/apic.h @@ -0,0 +1,199 @@ +#ifndef ASM_X86__APIC_H +#define ASM_X86__APIC_H + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define ARCH_APICTIMER_STOPS_ON_C3 1 + +/* + * Debugging macros + */ +#define APIC_QUIET 0 +#define APIC_VERBOSE 1 +#define APIC_DEBUG 2 + +/* + * Define the default level of output to be very little + * This can be turned up by using apic=verbose for more + * information and apic=debug for _lots_ of information. + * apic_verbosity is defined in apic.c + */ +#define apic_printk(v, s, a...) do { \ + if ((v) <= apic_verbosity) \ + printk(s, ##a); \ + } while (0) + + +extern void generic_apic_probe(void); + +#ifdef CONFIG_X86_LOCAL_APIC + +extern unsigned int apic_verbosity; +extern int local_apic_timer_c2_ok; + +extern int disable_apic; +/* + * Basic functions accessing APICs. + */ +#ifdef CONFIG_PARAVIRT +#include +#else +#define setup_boot_clock setup_boot_APIC_clock +#define setup_secondary_clock setup_secondary_APIC_clock +#endif + +extern int is_vsmp_box(void); +extern void xapic_wait_icr_idle(void); +extern u32 safe_xapic_wait_icr_idle(void); +extern u64 xapic_icr_read(void); +extern void xapic_icr_write(u32, u32); +extern int setup_profiling_timer(unsigned int); + +static inline void native_apic_mem_write(u32 reg, u32 v) +{ + volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); + + alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP, + ASM_OUTPUT2("=r" (v), "=m" (*addr)), + ASM_OUTPUT2("0" (v), "m" (*addr))); +} + +static inline u32 native_apic_mem_read(u32 reg) +{ + return *((volatile u32 *)(APIC_BASE + reg)); +} + +static inline void native_apic_msr_write(u32 reg, u32 v) +{ + if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || + reg == APIC_LVR) + return; + + wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); +} + +static inline u32 native_apic_msr_read(u32 reg) +{ + u32 low, high; + + if (reg == APIC_DFR) + return -1; + + rdmsr(APIC_BASE_MSR + (reg >> 4), low, high); + return low; +} + +#ifndef CONFIG_X86_32 +extern int x2apic, x2apic_preenabled; +extern void check_x2apic(void); +extern void enable_x2apic(void); +extern void enable_IR_x2apic(void); +extern void x2apic_icr_write(u32 low, u32 id); +static inline int x2apic_enabled(void) +{ + int msr, msr2; + + if (!cpu_has_x2apic) + return 0; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (msr & X2APIC_ENABLE) + return 1; + return 0; +} +#else +#define x2apic_enabled() 0 +#endif + +struct apic_ops { + u32 (*read)(u32 reg); + void (*write)(u32 reg, u32 v); + u64 (*icr_read)(void); + void (*icr_write)(u32 low, u32 high); + void (*wait_icr_idle)(void); + u32 (*safe_wait_icr_idle)(void); +}; + +extern struct apic_ops *apic_ops; + +#define apic_read (apic_ops->read) +#define apic_write (apic_ops->write) +#define apic_icr_read (apic_ops->icr_read) +#define apic_icr_write (apic_ops->icr_write) +#define apic_wait_icr_idle (apic_ops->wait_icr_idle) +#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle) + +extern int get_physical_broadcast(void); + +#ifdef CONFIG_X86_64 +static inline void ack_x2APIC_irq(void) +{ + /* Docs say use 0 for future compatibility */ + native_apic_msr_write(APIC_EOI, 0); +} +#endif + + +static inline void ack_APIC_irq(void) +{ + /* + * ack_APIC_irq() actually gets compiled as a single instruction + * ... yummie. + */ + + /* Docs say use 0 for future compatibility */ + apic_write(APIC_EOI, 0); +} + +extern int lapic_get_maxlvt(void); +extern void clear_local_APIC(void); +extern void connect_bsp_APIC(void); +extern void disconnect_bsp_APIC(int virt_wire_setup); +extern void disable_local_APIC(void); +extern void lapic_shutdown(void); +extern int verify_local_APIC(void); +extern void cache_APIC_registers(void); +extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); +extern void setup_local_APIC(void); +extern void end_local_APIC_setup(void); +extern void init_apic_mappings(void); +extern void setup_boot_APIC_clock(void); +extern void setup_secondary_APIC_clock(void); +extern int APIC_init_uniprocessor(void); +extern void enable_NMI_through_LVT0(void); + +/* + * On 32bit this is mach-xxx local + */ +#ifdef CONFIG_X86_64 +extern void early_init_lapic_mapping(void); +extern int apic_is_clustered_box(void); +#else +static inline int apic_is_clustered_box(void) +{ + return 0; +} +#endif + +extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); +extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); + + +#else /* !CONFIG_X86_LOCAL_APIC */ +static inline void lapic_shutdown(void) { } +#define local_apic_timer_c2_ok 1 +static inline void init_apic_mappings(void) { } + +#endif /* !CONFIG_X86_LOCAL_APIC */ + +#endif /* ASM_X86__APIC_H */ diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h new file mode 100644 index 00000000000..b922c85ac91 --- /dev/null +++ b/arch/x86/include/asm/apicdef.h @@ -0,0 +1,417 @@ +#ifndef ASM_X86__APICDEF_H +#define ASM_X86__APICDEF_H + +/* + * Constants for various Intel APICs. (local APIC, IOAPIC, etc.) + * + * Alan Cox , 1995. + * Ingo Molnar , 1999, 2000 + */ + +#define APIC_DEFAULT_PHYS_BASE 0xfee00000 + +#define APIC_ID 0x20 + +#define APIC_LVR 0x30 +#define APIC_LVR_MASK 0xFF00FF +#define GET_APIC_VERSION(x) ((x) & 0xFFu) +#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) +#ifdef CONFIG_X86_32 +# define APIC_INTEGRATED(x) ((x) & 0xF0u) +#else +# define APIC_INTEGRATED(x) (1) +#endif +#define APIC_XAPIC(x) ((x) >= 0x14) +#define APIC_TASKPRI 0x80 +#define APIC_TPRI_MASK 0xFFu +#define APIC_ARBPRI 0x90 +#define APIC_ARBPRI_MASK 0xFFu +#define APIC_PROCPRI 0xA0 +#define APIC_EOI 0xB0 +#define APIC_EIO_ACK 0x0 +#define APIC_RRR 0xC0 +#define APIC_LDR 0xD0 +#define APIC_LDR_MASK (0xFFu << 24) +#define GET_APIC_LOGICAL_ID(x) (((x) >> 24) & 0xFFu) +#define SET_APIC_LOGICAL_ID(x) (((x) << 24)) +#define APIC_ALL_CPUS 0xFFu +#define APIC_DFR 0xE0 +#define APIC_DFR_CLUSTER 0x0FFFFFFFul +#define APIC_DFR_FLAT 0xFFFFFFFFul +#define APIC_SPIV 0xF0 +#define APIC_SPIV_FOCUS_DISABLED (1 << 9) +#define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_ISR 0x100 +#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */ +#define APIC_TMR 0x180 +#define APIC_IRR 0x200 +#define APIC_ESR 0x280 +#define APIC_ESR_SEND_CS 0x00001 +#define APIC_ESR_RECV_CS 0x00002 +#define APIC_ESR_SEND_ACC 0x00004 +#define APIC_ESR_RECV_ACC 0x00008 +#define APIC_ESR_SENDILL 0x00020 +#define APIC_ESR_RECVILL 0x00040 +#define APIC_ESR_ILLREGA 0x00080 +#define APIC_ICR 0x300 +#define APIC_DEST_SELF 0x40000 +#define APIC_DEST_ALLINC 0x80000 +#define APIC_DEST_ALLBUT 0xC0000 +#define APIC_ICR_RR_MASK 0x30000 +#define APIC_ICR_RR_INVALID 0x00000 +#define APIC_ICR_RR_INPROG 0x10000 +#define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_LEVELTRIG 0x08000 +#define APIC_INT_ASSERT 0x04000 +#define APIC_ICR_BUSY 0x01000 +#define APIC_DEST_LOGICAL 0x00800 +#define APIC_DEST_PHYSICAL 0x00000 +#define APIC_DM_FIXED 0x00000 +#define APIC_DM_LOWEST 0x00100 +#define APIC_DM_SMI 0x00200 +#define APIC_DM_REMRD 0x00300 +#define APIC_DM_NMI 0x00400 +#define APIC_DM_INIT 0x00500 +#define APIC_DM_STARTUP 0x00600 +#define APIC_DM_EXTINT 0x00700 +#define APIC_VECTOR_MASK 0x000FF +#define APIC_ICR2 0x310 +#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) +#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define APIC_LVTT 0x320 +#define APIC_LVTTHMR 0x330 +#define APIC_LVTPC 0x340 +#define APIC_LVT0 0x350 +#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18) +#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3) +#define SET_APIC_TIMER_BASE(x) (((x) << 18)) +#define APIC_TIMER_BASE_CLKIN 0x0 +#define APIC_TIMER_BASE_TMBASE 0x1 +#define APIC_TIMER_BASE_DIV 0x2 +#define APIC_LVT_TIMER_PERIODIC (1 << 17) +#define APIC_LVT_MASKED (1 << 16) +#define APIC_LVT_LEVEL_TRIGGER (1 << 15) +#define APIC_LVT_REMOTE_IRR (1 << 14) +#define APIC_INPUT_POLARITY (1 << 13) +#define APIC_SEND_PENDING (1 << 12) +#define APIC_MODE_MASK 0x700 +#define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7) +#define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8)) +#define APIC_MODE_FIXED 0x0 +#define APIC_MODE_NMI 0x4 +#define APIC_MODE_EXTINT 0x7 +#define APIC_LVT1 0x360 +#define APIC_LVTERR 0x370 +#define APIC_TMICT 0x380 +#define APIC_TMCCT 0x390 +#define APIC_TDCR 0x3E0 +#define APIC_SELF_IPI 0x3F0 +#define APIC_TDR_DIV_TMBASE (1 << 2) +#define APIC_TDR_DIV_1 0xB +#define APIC_TDR_DIV_2 0x0 +#define APIC_TDR_DIV_4 0x1 +#define APIC_TDR_DIV_8 0x2 +#define APIC_TDR_DIV_16 0x3 +#define APIC_TDR_DIV_32 0x8 +#define APIC_TDR_DIV_64 0x9 +#define APIC_TDR_DIV_128 0xA +#define APIC_EILVT0 0x500 +#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ +#define APIC_EILVT_NR_AMD_10H 4 +#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) +#define APIC_EILVT_MSG_FIX 0x0 +#define APIC_EILVT_MSG_SMI 0x2 +#define APIC_EILVT_MSG_NMI 0x4 +#define APIC_EILVT_MSG_EXT 0x7 +#define APIC_EILVT_MASKED (1 << 16) +#define APIC_EILVT1 0x510 +#define APIC_EILVT2 0x520 +#define APIC_EILVT3 0x530 + +#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) +#define APIC_BASE_MSR 0x800 +#define X2APIC_ENABLE (1UL << 10) + +#ifdef CONFIG_X86_32 +# define MAX_IO_APICS 64 +#else +# define MAX_IO_APICS 128 +# define MAX_LOCAL_APIC 32768 +#endif + +/* + * All x86-64 systems are xAPIC compatible. + * In the following, "apicid" is a physical APIC ID. + */ +#define XAPIC_DEST_CPUS_SHIFT 4 +#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) +#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) +#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK) +#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT) +#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) +#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) + +/* + * the local APIC register structure, memory mapped. Not terribly well + * tested, but we might eventually use this one in the future - the + * problem why we cannot use it right now is the P5 APIC, it has an + * errata which cannot take 8-bit reads and writes, only 32-bit ones ... + */ +#define u32 unsigned int + +struct local_apic { + +/*000*/ struct { u32 __reserved[4]; } __reserved_01; + +/*010*/ struct { u32 __reserved[4]; } __reserved_02; + +/*020*/ struct { /* APIC ID Register */ + u32 __reserved_1 : 24, + phys_apic_id : 4, + __reserved_2 : 4; + u32 __reserved[3]; + } id; + +/*030*/ const + struct { /* APIC Version Register */ + u32 version : 8, + __reserved_1 : 8, + max_lvt : 8, + __reserved_2 : 8; + u32 __reserved[3]; + } version; + +/*040*/ struct { u32 __reserved[4]; } __reserved_03; + +/*050*/ struct { u32 __reserved[4]; } __reserved_04; + +/*060*/ struct { u32 __reserved[4]; } __reserved_05; + +/*070*/ struct { u32 __reserved[4]; } __reserved_06; + +/*080*/ struct { /* Task Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } tpr; + +/*090*/ const + struct { /* Arbitration Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } apr; + +/*0A0*/ const + struct { /* Processor Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } ppr; + +/*0B0*/ struct { /* End Of Interrupt Register */ + u32 eoi; + u32 __reserved[3]; + } eoi; + +/*0C0*/ struct { u32 __reserved[4]; } __reserved_07; + +/*0D0*/ struct { /* Logical Destination Register */ + u32 __reserved_1 : 24, + logical_dest : 8; + u32 __reserved_2[3]; + } ldr; + +/*0E0*/ struct { /* Destination Format Register */ + u32 __reserved_1 : 28, + model : 4; + u32 __reserved_2[3]; + } dfr; + +/*0F0*/ struct { /* Spurious Interrupt Vector Register */ + u32 spurious_vector : 8, + apic_enabled : 1, + focus_cpu : 1, + __reserved_2 : 22; + u32 __reserved_3[3]; + } svr; + +/*100*/ struct { /* In Service Register */ +/*170*/ u32 bitfield; + u32 __reserved[3]; + } isr [8]; + +/*180*/ struct { /* Trigger Mode Register */ +/*1F0*/ u32 bitfield; + u32 __reserved[3]; + } tmr [8]; + +/*200*/ struct { /* Interrupt Request Register */ +/*270*/ u32 bitfield; + u32 __reserved[3]; + } irr [8]; + +/*280*/ union { /* Error Status Register */ + struct { + u32 send_cs_error : 1, + receive_cs_error : 1, + send_accept_error : 1, + receive_accept_error : 1, + __reserved_1 : 1, + send_illegal_vector : 1, + receive_illegal_vector : 1, + illegal_register_address : 1, + __reserved_2 : 24; + u32 __reserved_3[3]; + } error_bits; + struct { + u32 errors; + u32 __reserved_3[3]; + } all_errors; + } esr; + +/*290*/ struct { u32 __reserved[4]; } __reserved_08; + +/*2A0*/ struct { u32 __reserved[4]; } __reserved_09; + +/*2B0*/ struct { u32 __reserved[4]; } __reserved_10; + +/*2C0*/ struct { u32 __reserved[4]; } __reserved_11; + +/*2D0*/ struct { u32 __reserved[4]; } __reserved_12; + +/*2E0*/ struct { u32 __reserved[4]; } __reserved_13; + +/*2F0*/ struct { u32 __reserved[4]; } __reserved_14; + +/*300*/ struct { /* Interrupt Command Register 1 */ + u32 vector : 8, + delivery_mode : 3, + destination_mode : 1, + delivery_status : 1, + __reserved_1 : 1, + level : 1, + trigger : 1, + __reserved_2 : 2, + shorthand : 2, + __reserved_3 : 12; + u32 __reserved_4[3]; + } icr1; + +/*310*/ struct { /* Interrupt Command Register 2 */ + union { + u32 __reserved_1 : 24, + phys_dest : 4, + __reserved_2 : 4; + u32 __reserved_3 : 24, + logical_dest : 8; + } dest; + u32 __reserved_4[3]; + } icr2; + +/*320*/ struct { /* LVT - Timer */ + u32 vector : 8, + __reserved_1 : 4, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + timer_mode : 1, + __reserved_3 : 14; + u32 __reserved_4[3]; + } lvt_timer; + +/*330*/ struct { /* LVT - Thermal Sensor */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_thermal; + +/*340*/ struct { /* LVT - Performance Counter */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_pc; + +/*350*/ struct { /* LVT - LINT0 */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + polarity : 1, + remote_irr : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15; + u32 __reserved_3[3]; + } lvt_lint0; + +/*360*/ struct { /* LVT - LINT1 */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + polarity : 1, + remote_irr : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15; + u32 __reserved_3[3]; + } lvt_lint1; + +/*370*/ struct { /* LVT - Error */ + u32 vector : 8, + __reserved_1 : 4, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_error; + +/*380*/ struct { /* Timer Initial Count Register */ + u32 initial_count; + u32 __reserved_2[3]; + } timer_icr; + +/*390*/ const + struct { /* Timer Current Count Register */ + u32 curr_count; + u32 __reserved_2[3]; + } timer_ccr; + +/*3A0*/ struct { u32 __reserved[4]; } __reserved_16; + +/*3B0*/ struct { u32 __reserved[4]; } __reserved_17; + +/*3C0*/ struct { u32 __reserved[4]; } __reserved_18; + +/*3D0*/ struct { u32 __reserved[4]; } __reserved_19; + +/*3E0*/ struct { /* Timer Divide Configuration Register */ + u32 divisor : 4, + __reserved_1 : 28; + u32 __reserved_2[3]; + } timer_dcr; + +/*3F0*/ struct { u32 __reserved[4]; } __reserved_20; + +} __attribute__ ((packed)); + +#undef u32 + +#ifdef CONFIG_X86_32 + #define BAD_APICID 0xFFu +#else + #define BAD_APICID 0xFFFFu +#endif +#endif /* ASM_X86__APICDEF_H */ diff --git a/arch/x86/include/asm/arch_hooks.h b/arch/x86/include/asm/arch_hooks.h new file mode 100644 index 00000000000..de4596b24c2 --- /dev/null +++ b/arch/x86/include/asm/arch_hooks.h @@ -0,0 +1,26 @@ +#ifndef ASM_X86__ARCH_HOOKS_H +#define ASM_X86__ARCH_HOOKS_H + +#include + +/* + * linux/include/asm/arch_hooks.h + * + * define the architecture specific hooks + */ + +/* these aren't arch hooks, they are generic routines + * that can be used by the hooks */ +extern void init_ISA_irqs(void); +extern irqreturn_t timer_interrupt(int irq, void *dev_id); + +/* these are the defined hooks */ +extern void intr_init_hook(void); +extern void pre_intr_init_hook(void); +extern void pre_setup_arch_hook(void); +extern void trap_init_hook(void); +extern void pre_time_init_hook(void); +extern void time_init_hook(void); +extern void mca_nmi_hook(void); + +#endif /* ASM_X86__ARCH_HOOKS_H */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h new file mode 100644 index 00000000000..e1355f44d7c --- /dev/null +++ b/arch/x86/include/asm/asm.h @@ -0,0 +1,47 @@ +#ifndef ASM_X86__ASM_H +#define ASM_X86__ASM_H + +#ifdef __ASSEMBLY__ +# define __ASM_FORM(x) x +# define __ASM_EX_SEC .section __ex_table +#else +# define __ASM_FORM(x) " " #x " " +# define __ASM_EX_SEC " .section __ex_table,\"a\"\n" +#endif + +#ifdef CONFIG_X86_32 +# define __ASM_SEL(a,b) __ASM_FORM(a) +#else +# define __ASM_SEL(a,b) __ASM_FORM(b) +#endif + +#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q) +#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) + +#define _ASM_PTR __ASM_SEL(.long, .quad) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) + +#define _ASM_MOV __ASM_SIZE(mov) +#define _ASM_INC __ASM_SIZE(inc) +#define _ASM_DEC __ASM_SIZE(dec) +#define _ASM_ADD __ASM_SIZE(add) +#define _ASM_SUB __ASM_SIZE(sub) +#define _ASM_XADD __ASM_SIZE(xadd) + +#define _ASM_AX __ASM_REG(ax) +#define _ASM_BX __ASM_REG(bx) +#define _ASM_CX __ASM_REG(cx) +#define _ASM_DX __ASM_REG(dx) +#define _ASM_SP __ASM_REG(sp) +#define _ASM_BP __ASM_REG(bp) +#define _ASM_SI __ASM_REG(si) +#define _ASM_DI __ASM_REG(di) + +/* Exception table entry */ +# define _ASM_EXTABLE(from,to) \ + __ASM_EX_SEC \ + _ASM_ALIGN "\n" \ + _ASM_PTR #from "," #to "\n" \ + " .previous\n" + +#endif /* ASM_X86__ASM_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h new file mode 100644 index 00000000000..4e1b8873c47 --- /dev/null +++ b/arch/x86/include/asm/atomic.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "atomic_32.h" +#else +# include "atomic_64.h" +#endif diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h new file mode 100644 index 00000000000..14d3f0beb88 --- /dev/null +++ b/arch/x86/include/asm/atomic_32.h @@ -0,0 +1,259 @@ +#ifndef ASM_X86__ATOMIC_32_H +#define ASM_X86__ATOMIC_32_H + +#include +#include +#include + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. + */ +typedef struct { + int counter; +} atomic_t; + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. + */ +#define atomic_read(v) ((v)->counter) + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic_set(v, i) (((v)->counter) = (i)) + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v. + */ +static inline void atomic_add(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub - subtract integer from atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic_sub(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "subl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_sub_and_test(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. + */ +static inline void atomic_inc(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "incl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec - decrement atomic variable + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic_dec(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "decl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec_and_test - decrement and test + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_inc_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_add_negative - add and test if negative + * @v: pointer of type atomic_t + * @i: integer value to add + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic_add_negative(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_add_return - add integer and return + * @v: pointer of type atomic_t + * @i: integer value to add + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline int atomic_add_return(int i, atomic_t *v) +{ + int __i; +#ifdef CONFIG_M386 + unsigned long flags; + if (unlikely(boot_cpu_data.x86 <= 3)) + goto no_xadd; +#endif + /* Modern 486+ processor */ + __i = i; + asm volatile(LOCK_PREFIX "xaddl %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; + +#ifdef CONFIG_M386 +no_xadd: /* Legacy 386 processor */ + local_irq_save(flags); + __i = atomic_read(v); + atomic_set(v, i + __i); + local_irq_restore(flags); + return i + __i; +#endif +} + +/** + * atomic_sub_return - subtract integer and return + * @v: pointer of type atomic_t + * @i: integer value to subtract + * + * Atomically subtracts @i from @v and returns @v - @i + */ +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) +#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) + +/** + * atomic_add_unless - add unless the number is already a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as @v was not already @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +/* These are x86-specific, used by some header files */ +#define atomic_clear_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "andl %0,%1" \ + : : "r" (~(mask)), "m" (*(addr)) : "memory") + +#define atomic_set_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" (mask), "m" (*(addr)) : "memory") + +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + +#include +#endif /* ASM_X86__ATOMIC_32_H */ diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h new file mode 100644 index 00000000000..2cb218c4a35 --- /dev/null +++ b/arch/x86/include/asm/atomic_64.h @@ -0,0 +1,473 @@ +#ifndef ASM_X86__ATOMIC_64_H +#define ASM_X86__ATOMIC_64_H + +#include +#include + +/* atomic_t should be 32 bit signed type */ + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. + */ +typedef struct { + int counter; +} atomic_t; + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. + */ +#define atomic_read(v) ((v)->counter) + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic_set(v, i) (((v)->counter) = (i)) + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v. + */ +static inline void atomic_add(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "=m" (v->counter) + : "ir" (i), "m" (v->counter)); +} + +/** + * atomic_sub - subtract the atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic_sub(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "subl %1,%0" + : "=m" (v->counter) + : "ir" (i), "m" (v->counter)); +} + +/** + * atomic_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_sub_and_test(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "ir" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. + */ +static inline void atomic_inc(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "incl %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic_dec - decrement atomic variable + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic_dec(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "decl %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic_dec_and_test - decrement and test + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decl %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_inc_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incl %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic_add_negative(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" + : "=m" (v->counter), "=qm" (c) + : "ir" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic_add_return - add and return + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline int atomic_add_return(int i, atomic_t *v) +{ + int __i = i; + asm volatile(LOCK_PREFIX "xaddl %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +/* An 64bit atomic type */ + +typedef struct { + long counter; +} atomic64_t; + +#define ATOMIC64_INIT(i) { (i) } + +/** + * atomic64_read - read atomic64 variable + * @v: pointer of type atomic64_t + * + * Atomically reads the value of @v. + * Doesn't imply a read memory barrier. + */ +#define atomic64_read(v) ((v)->counter) + +/** + * atomic64_set - set atomic64 variable + * @v: pointer to type atomic64_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic64_set(v, i) (((v)->counter) = (i)) + +/** + * atomic64_add - add integer to atomic64 variable + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v. + */ +static inline void atomic64_add(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "addq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub - subtract the atomic64 variable + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic64_sub(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "subq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_sub_and_test(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_inc - increment atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1. + */ +static inline void atomic64_inc(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "incq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic64_dec(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "decq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec_and_test - decrement and test + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic64_dec_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_inc_and_test - increment and test + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_inc_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic64_add_negative(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_add_return - add and return + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline long atomic64_add_return(long i, atomic64_t *v) +{ + long __i = i; + asm volatile(LOCK_PREFIX "xaddq %0, %1;" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +} + +static inline long atomic64_sub_return(long i, atomic64_t *v) +{ + return atomic64_add_return(-i, v); +} + +#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) +#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) + +#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) +#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) + +#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) +#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) + +/** + * atomic_add_unless - add unless the number is a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as it was not @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + +/** + * atomic64_add_unless - add unless the number is a given value + * @v: pointer of type atomic64_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as it was not @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic64_add_unless(atomic64_t *v, long a, long u) +{ + long c, old; + c = atomic64_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic64_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +/** + * atomic_inc_short - increment of a short integer + * @v: pointer to type int + * + * Atomically adds 1 to @v + * Returns the new value of @u + */ +static inline short int atomic_inc_short(short int *v) +{ + asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); + return *v; +} + +/** + * atomic_or_long - OR of two long integers + * @v1: pointer to type unsigned long + * @v2: pointer to type unsigned long + * + * Atomically ORs @v1 and @v2 + * Returns the result of the OR + */ +static inline void atomic_or_long(unsigned long *v1, unsigned long v2) +{ + asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); +} + +#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) + +/* These are x86-specific, used by some header files */ +#define atomic_clear_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "andl %0,%1" \ + : : "r" (~(mask)), "m" (*(addr)) : "memory") + +#define atomic_set_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" ((unsigned)(mask)), "m" (*(addr)) \ + : "memory") + +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + +#include +#endif /* ASM_X86__ATOMIC_64_H */ diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h new file mode 100644 index 00000000000..12c7cac7420 --- /dev/null +++ b/arch/x86/include/asm/auxvec.h @@ -0,0 +1,12 @@ +#ifndef ASM_X86__AUXVEC_H +#define ASM_X86__AUXVEC_H +/* + * Architecture-neutral AT_ values in 0-17, leave some room + * for more of them, start the x86-specific ones at 32. + */ +#ifdef __i386__ +#define AT_SYSINFO 32 +#endif +#define AT_SYSINFO_EHDR 33 + +#endif /* ASM_X86__AUXVEC_H */ diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h new file mode 100644 index 00000000000..1d9543b9d35 --- /dev/null +++ b/arch/x86/include/asm/bigsmp/apic.h @@ -0,0 +1,139 @@ +#ifndef __ASM_MACH_APIC_H +#define __ASM_MACH_APIC_H + +#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu)) +#define esr_disable (1) + +static inline int apic_id_registered(void) +{ + return (1); +} + +static inline cpumask_t target_cpus(void) +{ +#ifdef CONFIG_SMP + return cpu_online_map; +#else + return cpumask_of_cpu(0); +#endif +} + +#undef APIC_DEST_LOGICAL +#define APIC_DEST_LOGICAL 0 +#define APIC_DFR_VALUE (APIC_DFR_FLAT) +#define INT_DELIVERY_MODE (dest_Fixed) +#define INT_DEST_MODE (0) /* phys delivery to target proc */ +#define NO_BALANCE_IRQ (0) +#define WAKE_SECONDARY_VIA_INIT + + +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ + return (0); +} + +static inline unsigned long check_apicid_present(int bit) +{ + return (1); +} + +static inline unsigned long calculate_ldr(int cpu) +{ + unsigned long val, id; + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + id = xapic_phys_to_log_apicid(cpu); + val |= SET_APIC_LOGICAL_ID(id); + return val; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static inline void init_apic_ldr(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + +static inline void setup_apic_routing(void) +{ + printk("Enabling APIC mode: %s. Using %d I/O APICs\n", + "Physflat", nr_ioapics); +} + +static inline int multi_timer_check(int apic, int irq) +{ + return (0); +} + +static inline int apicid_to_node(int logical_apicid) +{ + return apicid_2_node[hard_smp_processor_id()]; +} + +static inline int cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < NR_CPUS) + return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); + + return BAD_APICID; +} + +static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) +{ + return physid_mask_of_physid(phys_apicid); +} + +extern u8 cpu_2_logical_apicid[]; +/* Mapping from cpu number to logical apicid */ +static inline int cpu_to_logical_apicid(int cpu) +{ + if (cpu >= NR_CPUS) + return BAD_APICID; + return cpu_physical_id(cpu); +} + +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +{ + /* For clustered we don't have a good way to do this yet - hack */ + return physids_promote(0xFFL); +} + +static inline void setup_portio_remap(void) +{ +} + +static inline void enable_apic_mode(void) +{ +} + +static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +{ + return (1); +} + +/* As we are using single CPU as destination, pick only one CPU here */ +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + int apicid; + + cpu = first_cpu(cpumask); + apicid = cpu_to_logical_apicid(cpu); + return apicid; +} + +static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +#endif /* __ASM_MACH_APIC_H */ diff --git a/arch/x86/include/asm/bigsmp/apicdef.h b/arch/x86/include/asm/bigsmp/apicdef.h new file mode 100644 index 00000000000..392c3f5ef2f --- /dev/null +++ b/arch/x86/include/asm/bigsmp/apicdef.h @@ -0,0 +1,13 @@ +#ifndef __ASM_MACH_APICDEF_H +#define __ASM_MACH_APICDEF_H + +#define APIC_ID_MASK (0xFF<<24) + +static inline unsigned get_apic_id(unsigned long x) +{ + return (((x)>>24)&0xFF); +} + +#define GET_APIC_ID(x) get_apic_id(x) + +#endif diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h new file mode 100644 index 00000000000..9404c535b7e --- /dev/null +++ b/arch/x86/include/asm/bigsmp/ipi.h @@ -0,0 +1,25 @@ +#ifndef __ASM_MACH_IPI_H +#define __ASM_MACH_IPI_H + +void send_IPI_mask_sequence(cpumask_t mask, int vector); + +static inline void send_IPI_mask(cpumask_t mask, int vector) +{ + send_IPI_mask_sequence(mask, vector); +} + +static inline void send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + send_IPI_mask(mask, vector); +} + +static inline void send_IPI_all(int vector) +{ + send_IPI_mask(cpu_online_map, vector); +} + +#endif /* __ASM_MACH_IPI_H */ diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h new file mode 100644 index 00000000000..79b4b88505d --- /dev/null +++ b/arch/x86/include/asm/bios_ebda.h @@ -0,0 +1,36 @@ +#ifndef ASM_X86__BIOS_EBDA_H +#define ASM_X86__BIOS_EBDA_H + +#include + +/* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E. + */ +static inline unsigned int get_bios_ebda(void) +{ + unsigned int address = *(unsigned short *)phys_to_virt(0x40E); + address <<= 4; + return address; /* 0 means none */ +} + +void reserve_ebda_region(void); + +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +/* + * This is obviously not a great place for this, but we want to be + * able to scatter it around anywhere in the kernel. + */ +void check_for_bios_corruption(void); +void start_periodic_check_for_corruption(void); +#else +static inline void check_for_bios_corruption(void) +{ +} + +static inline void start_periodic_check_for_corruption(void) +{ +} +#endif + +#endif /* ASM_X86__BIOS_EBDA_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h new file mode 100644 index 00000000000..451a74762bd --- /dev/null +++ b/arch/x86/include/asm/bitops.h @@ -0,0 +1,451 @@ +#ifndef ASM_X86__BITOPS_H +#define ASM_X86__BITOPS_H + +/* + * Copyright 1992, Linus Torvalds. + */ + +#ifndef _LINUX_BITOPS_H +#error only can be included directly +#endif + +#include +#include + +/* + * These have to be done with inline assembly: that way the bit-setting + * is guaranteed to be atomic. All bit operations return 0 if the bit + * was cleared before the operation and != 0 if it was not. + * + * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). + */ + +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. */ +#define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) +#else +#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +/* + * We do the locked ops that don't return the old value as + * a mask operation on a byte. + */ +#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) +#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3)) +#define CONST_MASK(nr) (1 << ((nr) & 7)) + +/** + * set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. + * + * Note: there are no guarantees that this function will not be reordered + * on non x86 architectures, so if you are writing portable code, + * make sure not to rely on its reordering guarantees. + * + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void set_bit(unsigned int nr, volatile unsigned long *addr) +{ + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "orb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "iq" ((u8)CONST_MASK(nr)) + : "memory"); + } else { + asm volatile(LOCK_PREFIX "bts %1,%0" + : BITOP_ADDR(addr) : "Ir" (nr) : "memory"); + } +} + +/** + * __set_bit - Set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * Unlike set_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __set_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); +} + +/** + * clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "andb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "iq" ((u8)~CONST_MASK(nr))); + } else { + asm volatile(LOCK_PREFIX "btr %1,%0" + : BITOP_ADDR(addr) + : "Ir" (nr)); + } +} + +/* + * clear_bit_unlock - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and implies release semantics before the memory + * operation. It can be used for an unlock. + */ +static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +{ + barrier(); + clear_bit(nr, addr); +} + +static inline void __clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); +} + +/* + * __clear_bit_unlock - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * __clear_bit() is non-atomic and implies release semantics before the memory + * operation. It can be used for an unlock if no other CPUs can concurrently + * modify other bits in the word. + * + * No memory barrier is required here, because x86 cannot reorder stores past + * older loads. Same principle as spin_unlock. + */ +static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +{ + barrier(); + __clear_bit(nr, addr); +} + +#define smp_mb__before_clear_bit() barrier() +#define smp_mb__after_clear_bit() barrier() + +/** + * __change_bit - Toggle a bit in memory + * @nr: the bit to change + * @addr: the address to start counting from + * + * Unlike change_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); +} + +/** + * change_bit - Toggle a bit in memory + * @nr: Bit to change + * @addr: Address to start counting from + * + * change_bit() is atomic and may not be reordered. + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr)); +} + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "bts %2,%1\n\t" + "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * test_and_set_bit_lock - Set a bit and return its old value for lock + * @nr: Bit to set + * @addr: Address to count from + * + * This is the same as test_and_set_bit on x86. + */ +static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr) +{ + return test_and_set_bit(nr, addr); +} + +/** + * __test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm("bts %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR + : "Ir" (nr)); + return oldbit; +} + +/** + * test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "btr %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * __test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("btr %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR + : "Ir" (nr)); + return oldbit; +} + +/* WARNING: non atomic and it can be reordered! */ +static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("btc %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR + : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * test_and_change_bit - Change a bit and return its old value + * @nr: Bit to change + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_change_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "btc %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +static inline int constant_test_bit(int nr, const volatile unsigned long *addr) +{ + return ((1UL << (nr % BITS_PER_LONG)) & + (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; +} + +static inline int variable_test_bit(int nr, volatile const unsigned long *addr) +{ + int oldbit; + + asm volatile("bt %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +#if 0 /* Fool kernel-doc since it doesn't do macros yet */ +/** + * test_bit - Determine whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static int test_bit(int nr, const volatile unsigned long *addr); +#endif + +#define test_bit(nr, addr) \ + (__builtin_constant_p((nr)) \ + ? constant_test_bit((nr), (addr)) \ + : variable_test_bit((nr), (addr))) + +/** + * __ffs - find first set bit in word + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + asm("bsf %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} + +/** + * ffz - find first zero bit in word + * @word: The word to search + * + * Undefined if no zero exists, so code should check against ~0UL first. + */ +static inline unsigned long ffz(unsigned long word) +{ + asm("bsf %1,%0" + : "=r" (word) + : "r" (~word)); + return word; +} + +/* + * __fls: find last set bit in word + * @word: The word to search + * + * Undefined if no set bit exists, so code should check against 0 first. + */ +static inline unsigned long __fls(unsigned long word) +{ + asm("bsr %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} + +#ifdef __KERNEL__ +/** + * ffs - find first set bit in word + * @x: the word to search + * + * This is defined the same way as the libc and compiler builtin ffs + * routines, therefore differs in spirit from the other bitops. + * + * ffs(value) returns 0 if value is 0 or the position of the first + * set bit if value is nonzero. The first (least significant) bit + * is at position 1. + */ +static inline int ffs(int x) +{ + int r; +#ifdef CONFIG_X86_CMOV + asm("bsfl %1,%0\n\t" + "cmovzl %2,%0" + : "=r" (r) : "rm" (x), "r" (-1)); +#else + asm("bsfl %1,%0\n\t" + "jnz 1f\n\t" + "movl $-1,%0\n" + "1:" : "=r" (r) : "rm" (x)); +#endif + return r + 1; +} + +/** + * fls - find last set bit in word + * @x: the word to search + * + * This is defined in a similar way as the libc and compiler builtin + * ffs, but returns the position of the most significant set bit. + * + * fls(value) returns 0 if value is 0 or the position of the last + * set bit if value is nonzero. The last (most significant) bit is + * at position 32. + */ +static inline int fls(int x) +{ + int r; +#ifdef CONFIG_X86_CMOV + asm("bsrl %1,%0\n\t" + "cmovzl %2,%0" + : "=&r" (r) : "rm" (x), "rm" (-1)); +#else + asm("bsrl %1,%0\n\t" + "jnz 1f\n\t" + "movl $-1,%0\n" + "1:" : "=r" (r) : "rm" (x)); +#endif + return r + 1; +} +#endif /* __KERNEL__ */ + +#undef ADDR + +#ifdef __KERNEL__ + +#include + +#define ARCH_HAS_FAST_MULTIPLIER 1 + +#include + +#endif /* __KERNEL__ */ + +#include + +#ifdef __KERNEL__ + +#include + +#define ext2_set_bit_atomic(lock, nr, addr) \ + test_and_set_bit((nr), (unsigned long *)(addr)) +#define ext2_clear_bit_atomic(lock, nr, addr) \ + test_and_clear_bit((nr), (unsigned long *)(addr)) + +#include + +#endif /* __KERNEL__ */ +#endif /* ASM_X86__BITOPS_H */ diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h new file mode 100644 index 00000000000..1d63bd5d594 --- /dev/null +++ b/arch/x86/include/asm/boot.h @@ -0,0 +1,26 @@ +#ifndef ASM_X86__BOOT_H +#define ASM_X86__BOOT_H + +/* Don't touch these, unless you really know what you're doing. */ +#define DEF_SYSSEG 0x1000 +#define DEF_SYSSIZE 0x7F00 + +/* Internal svga startup constants */ +#define NORMAL_VGA 0xffff /* 80x25 mode */ +#define EXTENDED_VGA 0xfffe /* 80x50 mode */ +#define ASK_VGA 0xfffd /* ask for it at bootup */ + +/* Physical address where kernel should be loaded. */ +#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + + (CONFIG_PHYSICAL_ALIGN - 1)) \ + & ~(CONFIG_PHYSICAL_ALIGN - 1)) + +#ifdef CONFIG_X86_64 +#define BOOT_HEAP_SIZE 0x7000 +#define BOOT_STACK_SIZE 0x4000 +#else +#define BOOT_HEAP_SIZE 0x4000 +#define BOOT_STACK_SIZE 0x1000 +#endif + +#endif /* ASM_X86__BOOT_H */ diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h new file mode 100644 index 00000000000..ccf027e2d97 --- /dev/null +++ b/arch/x86/include/asm/bootparam.h @@ -0,0 +1,111 @@ +#ifndef ASM_X86__BOOTPARAM_H +#define ASM_X86__BOOTPARAM_H + +#include +#include +#include +#include +#include +#include +#include