summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/acpi/boot.c12
-rw-r--r--arch/x86/kernel/amd_iommu.c51
-rw-r--r--arch/x86/kernel/amd_iommu_init.c6
-rw-r--r--arch/x86/kernel/apic.c14
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c18
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h17
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c45
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c108
-rw-r--r--arch/x86/kernel/crash.c70
-rw-r--r--arch/x86/kernel/ds.c741
-rw-r--r--arch/x86/kernel/dumpstack.c351
-rw-r--r--arch/x86/kernel/dumpstack.h39
-rw-r--r--arch/x86/kernel/dumpstack_32.c307
-rw-r--r--arch/x86/kernel/dumpstack_64.c289
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/entry_32.S51
-rw-r--r--arch/x86/kernel/entry_64.S83
-rw-r--r--arch/x86/kernel/es7000_32.c71
-rw-r--r--arch/x86/kernel/ftrace.c390
-rw-r--r--arch/x86/kernel/genapic_64.c4
-rw-r--r--arch/x86/kernel/genapic_flat_64.c105
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c77
-rw-r--r--arch/x86/kernel/genx2apic_phys.c72
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c59
-rw-r--r--arch/x86/kernel/hpet.c12
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/i8253.c2
-rw-r--r--arch/x86/kernel/io_apic.c1050
-rw-r--r--arch/x86/kernel/ipi.c28
-rw-r--r--arch/x86/kernel/irq.c3
-rw-r--r--arch/x86/kernel/irq_32.c15
-rw-r--r--arch/x86/kernel/irq_64.c17
-rw-r--r--arch/x86/kernel/irqinit_32.c3
-rw-r--r--arch/x86/kernel/irqinit_64.c3
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/machine_kexec_32.c104
-rw-r--r--arch/x86/kernel/mfgpt_32.c2
-rw-r--r--arch/x86/kernel/mpparse.c24
-rw-r--r--arch/x86/kernel/numaq_32.c10
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c3
-rw-r--r--arch/x86/kernel/pci-calgary_64.c2
-rw-r--r--arch/x86/kernel/pci-gart_64.c2
-rw-r--r--arch/x86/kernel/process.c32
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kernel/ptrace.c98
-rw-r--r--arch/x86/kernel/reboot.c132
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S115
-rw-r--r--arch/x86/kernel/setup.c31
-rw-r--r--arch/x86/kernel/setup_percpu.c19
-rw-r--r--arch/x86/kernel/smp.c21
-rw-r--r--arch/x86/kernel/smpboot.c27
-rw-r--r--arch/x86/kernel/stacktrace.c64
-rw-r--r--arch/x86/kernel/tlb_32.c2
-rw-r--r--arch/x86/kernel/tlb_64.c2
-rw-r--r--arch/x86/kernel/tsc_sync.c4
-rw-r--r--arch/x86/kernel/vmi_32.c16
-rw-r--r--arch/x86/kernel/vmiclock_32.c2
-rw-r--r--arch/x86/kernel/vsyscall_64.c3
-rw-r--r--arch/x86/kernel/xsave.c2
64 files changed, 2897 insertions, 1979 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e489ff9cb3e..1cad9318d21 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,7 @@ CFLAGS_tsc.o := $(nostackp)
obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time_$(BITS).o ioport.o ldt.o
+obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
@@ -41,7 +41,7 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
-obj-y += ds.o
+obj-$(CONFIG_X86_DS) += ds.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
@@ -65,6 +65,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8c1f76abae9..65d0b72777e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1343,7 +1343,6 @@ static void __init acpi_process_madt(void)
error = acpi_parse_madt_ioapic_entries();
if (!error) {
acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
- acpi_irq_balance_set(NULL);
acpi_ioapic = 1;
smp_found_config = 1;
@@ -1361,6 +1360,17 @@ static void __init acpi_process_madt(void)
disable_acpi();
}
}
+
+ /*
+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
+ * processors, where MPS only supports physical.
+ */
+ if (acpi_lapic && acpi_ioapic)
+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+ "information\n");
+ else if (acpi_lapic)
+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+ "configuration information\n");
#endif
return;
}
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 331b318304e..a7b6dec6fc3 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -187,6 +187,8 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
spin_lock_irqsave(&iommu->lock, flags);
ret = __iommu_queue_command(iommu, cmd);
+ if (!ret)
+ iommu->need_sync = 1;
spin_unlock_irqrestore(&iommu->lock, flags);
return ret;
@@ -210,10 +212,13 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
- iommu->need_sync = 0;
-
spin_lock_irqsave(&iommu->lock, flags);
+ if (!iommu->need_sync)
+ goto out;
+
+ iommu->need_sync = 0;
+
ret = __iommu_queue_command(iommu, &cmd);
if (ret)
@@ -254,8 +259,6 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
ret = iommu_queue_command(iommu, &cmd);
- iommu->need_sync = 1;
-
return ret;
}
@@ -281,8 +284,6 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
ret = iommu_queue_command(iommu, &cmd);
- iommu->need_sync = 1;
-
return ret;
}
@@ -343,7 +344,7 @@ static int iommu_map(struct protection_domain *dom,
u64 __pte, *pte, *page;
bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(bus_addr);
+ phys_addr = PAGE_ALIGN(phys_addr);
/* only support 512GB address spaces for now */
if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
@@ -537,7 +538,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
address >>= PAGE_SHIFT;
iommu_area_free(dom->bitmap, address, pages);
- if (address + pages >= dom->next_bit)
+ if (address >= dom->next_bit)
dom->need_flush = true;
}
@@ -599,7 +600,7 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
continue;
p2 = IOMMU_PTE_PAGE(p1[i]);
- for (j = 0; j < 512; ++i) {
+ for (j = 0; j < 512; ++j) {
if (!IOMMU_PTE_PRESENT(p2[j]))
continue;
p3 = IOMMU_PTE_PAGE(p2[j]);
@@ -762,8 +763,6 @@ static void set_device_domain(struct amd_iommu *iommu,
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
iommu_queue_inv_dev_entry(iommu, devid);
-
- iommu->need_sync = 1;
}
/*****************************************************************************
@@ -858,6 +857,9 @@ static int get_device_resources(struct device *dev,
print_devid(_bdf, 1);
}
+ if (domain_for_device(_bdf) == NULL)
+ set_device_domain(*iommu, *domain, _bdf);
+
return 1;
}
@@ -908,7 +910,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
if (address >= dom->aperture_size)
return;
- WARN_ON(address & 0xfffULL || address > dom->aperture_size);
+ WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
pte += IOMMU_PTE_L0_INDEX(address);
@@ -920,8 +922,8 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
/*
* This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is uses by all
- * mapping functions provided by this IOMMU driver.
+ * contiguous memory region into DMA address space. It is used by all
+ * mapping functions provided with this IOMMU driver.
* Must be called with the domain lock held.
*/
static dma_addr_t __map_single(struct device *dev,
@@ -981,7 +983,8 @@ static void __unmap_single(struct amd_iommu *iommu,
dma_addr_t i, start;
unsigned int pages;
- if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
+ if ((dma_addr == bad_dma_address) ||
+ (dma_addr + size > dma_dom->aperture_size))
return;
pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
@@ -1031,8 +1034,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
if (addr == bad_dma_address)
goto out;
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1060,8 +1062,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
__unmap_single(iommu, domain->priv, dma_addr, size, dir);
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1127,8 +1128,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
goto unmap;
}
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1173,8 +1173,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
s->dma_address = s->dma_length = 0;
}
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -1225,8 +1224,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
goto out;
}
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -1257,8 +1255,7 @@ static void free_coherent(struct device *dev, size_t size,
__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
- if (unlikely(iommu->need_sync))
- iommu_completion_wait(iommu);
+ iommu_completion_wait(iommu);
spin_unlock_irqrestore(&domain->lock, flags);
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 0cdcda35a05..30ae2701b3d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -121,7 +121,7 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
-int amd_iommu_isolate; /* if 1, device isolation is enabled */
+int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -1213,7 +1213,9 @@ static int __init parse_amd_iommu_options(char *str)
for (; *str; ++str) {
if (strncmp(str, "isolate", 7) == 0)
amd_iommu_isolate = 1;
- if (strncmp(str, "fullflush", 11) == 0)
+ if (strncmp(str, "share", 5) == 0)
+ amd_iommu_isolate = 0;
+ if (strncmp(str, "fullflush", 9) == 0)
amd_iommu_unmap_flush = true;
}
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 1d410ee4b06..93cf2d13f33 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -141,7 +141,7 @@ static int lapic_next_event(unsigned long delta,
struct clock_event_device *evt);
static void lapic_timer_setup(enum clock_event_mode mode,
struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
+static void lapic_timer_broadcast(const cpumask_t *mask);
static void apic_pm_activate(void);
/*
@@ -454,7 +454,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
/*
* Local APIC timer broadcast function
*/
-static void lapic_timer_broadcast(cpumask_t mask)
+static void lapic_timer_broadcast(const cpumask_t *mask)
{
#ifdef CONFIG_SMP
send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
@@ -470,7 +470,7 @@ static void __cpuinit setup_APIC_timer(void)
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
memcpy(levt, &lapic_clockevent, sizeof(*levt));
- levt->cpumask = cpumask_of_cpu(smp_processor_id());
+ levt->cpumask = cpumask_of(smp_processor_id());
clockevents_register_device(levt);
}
@@ -1309,7 +1309,7 @@ void enable_x2apic(void)
}
}
-void enable_IR_x2apic(void)
+void __init enable_IR_x2apic(void)
{
#ifdef CONFIG_INTR_REMAP
int ret;
@@ -1890,8 +1890,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
}
#endif
- cpu_set(cpu, cpu_possible_map);
- cpu_set(cpu, cpu_present_map);
+ set_cpu_possible(cpu, true);
+ set_cpu_present(cpu, true);
}
#ifdef CONFIG_X86_64
@@ -2093,7 +2093,7 @@ __cpuinit int apic_is_clustered_box(void)
bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
- for (i = 0; i < NR_CPUS; i++) {
+ for (i = 0; i < nr_cpu_ids; i++) {
/* are we being called early in kernel startup? */
if (bios_cpu_apicid) {
id = bios_cpu_apicid[i];
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5145a6e72bb..3a26525a3f3 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -391,11 +391,7 @@ static int power_off;
#else
static int power_off = 1;
#endif
-#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
-static int realmode_power_off = 1;
-#else
static int realmode_power_off;
-#endif
#ifdef CONFIG_APM_ALLOW_INTS
static int allow_ints = 1;
#else
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d4467..88ea02dcb62 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
#include <linux/cpufreq.h>
#include <linux/compiler.h>
#include <linux/dmi.h>
+#include <linux/ftrace.h>
#include <linux/acpi.h>
#include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
unsigned int next_perf_state = 0; /* Index into perf table */
unsigned int i;
int result = 0;
+ struct power_trace it;
dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}
+ trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index d3dcd58b87c..7f05f44b97e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -115,9 +115,20 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
u32 i = 0;
if (cpu_family == CPU_HW_PSTATE) {
- rdmsr(MSR_PSTATE_STATUS, lo, hi);
- i = lo & HW_PSTATE_MASK;
- data->currpstate = i;
+ if (data->currpstate == HW_PSTATE_INVALID) {
+ /* read (initial) hw pstate if not yet set */
+ rdmsr(MSR_PSTATE_STATUS, lo, hi);
+ i = lo & HW_PSTATE_MASK;
+
+ /*
+ * a workaround for family 11h erratum 311 might cause
+ * an "out-of-range Pstate if the core is in Pstate-0
+ */
+ if (i >= data->numps)
+ data->currpstate = HW_PSTATE_0;
+ else
+ data->currpstate = i;
+ }
return 0;
}
do {
@@ -1121,6 +1132,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
}
data->cpu = pol->cpu;
+ data->currpstate = HW_PSTATE_INVALID;
if (powernow_k8_cpu_init_acpi(data)) {
/*
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index ab48cfed4d9..65cfb5d7f77 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,6 +5,19 @@
* http://www.gnu.org/licenses/gpl.html
*/
+
+enum pstate {
+ HW_PSTATE_INVALID = 0xff,
+ HW_PSTATE_0 = 0,
+ HW_PSTATE_1 = 1,
+ HW_PSTATE_2 = 2,
+ HW_PSTATE_3 = 3,
+ HW_PSTATE_4 = 4,
+ HW_PSTATE_5 = 5,
+ HW_PSTATE_6 = 6,
+ HW_PSTATE_7 = 7,
+};
+
struct powernow_k8_data {
unsigned int cpu;
@@ -23,7 +36,9 @@ struct powernow_k8_data {
u32 exttype; /* extended interface = 1 */
/* keep track of the current fid / vid or pstate */
- u32 currvid, currfid, currpstate;
+ u32 currvid;
+ u32 currfid;
+ enum pstate currpstate;
/* the powernow_table includes all frequency and vid/fid pairings:
* fid are the lower 8 bits of the index, vid are the upper 8 bits.
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cce0b6118d5..816f27f289b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -307,12 +307,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_P4);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
+#endif
if (cpu_has_bts)
ptrace_bts_init_intel(c);
-#endif
-
detect_extended_topology(c);
if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
/*
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3f46afbb1cf..fb7f946cb65 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -534,31 +534,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
per_cpu(cpuid4_info, cpu) = NULL;
}
-static int __cpuinit detect_cache_attributes(unsigned int cpu)
+static void get_cpu_leaves(void *_retval)
{
- struct _cpuid4_info *this_leaf;
- unsigned long j;
- int retval;
- cpumask_t oldmask;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- oldmask = current->cpus_allowed;
- retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- if (retval)
- goto out;
+ int j, *retval = _retval, cpu = smp_processor_id();
/* Do cpuid and store the results */
for (j = 0; j < num_cache_leaves; j++) {
+ struct _cpuid4_info *this_leaf;
this_leaf = CPUID4_INFO_IDX(cpu, j);
- retval = cpuid4_cache_lookup(j, this_leaf);
- if (unlikely(retval < 0)) {
+ *retval = cpuid4_cache_lookup(j, this_leaf);
+ if (unlikely(*retval < 0)) {
int i;
for (i = 0; i < j; i++)
@@ -567,9 +552,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
}
cache_shared_cpu_map_setup(cpu, j);
}
- set_cpus_allowed_ptr(current, &oldmask);
+}
+
+static int __cpuinit detect_cache_attributes(unsigned int cpu)
+{
+ int retval;
+
+ if (num_cache_leaves == 0)
+ return -ENOENT;
+
+ per_cpu(cpuid4_info, cpu) = kzalloc(
+ sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
+ if (per_cpu(cpuid4_info, cpu) == NULL)
+ return -ENOMEM;
-out:
+ smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
if (retval) {
kfree(per_cpu(cpuid4_info, cpu));
per_cpu(cpuid4_info, cpu) = NULL;
@@ -626,8 +623,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
cpumask_t *mask = &this_leaf->shared_cpu_map;
n = type?
- cpulist_scnprintf(buf, len-2, *mask):
- cpumask_scnprintf(buf, len-2, *mask);
+ cpulist_scnprintf(buf, len-2, mask) :
+ cpumask_scnprintf(buf, len-2, mask);
buf[n++] = '\n';
buf[n] = '\0';
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 5eb390a4b2e..a1de80f368f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
* CPU Initialization
*/
+struct thresh_restart {
+ struct threshold_block *b;
+ int reset;
+ u16 old_limit;
+};
+
/* must be called with correct cpu affinity */
-static void threshold_restart_bank(struct threshold_block *b,
- int reset, u16 old_limit)
+static long threshold_restart_bank(void *_tr)
{
+ struct thresh_restart *tr = _tr;
u32 mci_misc_hi, mci_misc_lo;
- rdmsr(b->address, mci_misc_lo, mci_misc_hi);
+ rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
- if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
- reset = 1; /* limit cannot be lower than err count */
+ if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+ tr->reset = 1; /* limit cannot be lower than err count */
- if (reset) { /* reset err count and overflow bit */
+ if (tr->reset) { /* reset err count and overflow bit */
mci_misc_hi =
(mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- (THRESHOLD_MAX - b->threshold_limit);
- } else if (old_limit) { /* change limit w/o reset */
+ (THRESHOLD_MAX - tr->b->threshold_limit);
+ } else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (mci_misc_hi & THRESHOLD_MAX) +
- (old_limit - b->threshold_limit);
+ (tr->old_limit - tr->b->threshold_limit);
mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
- b->interrupt_enable ?
+ tr->b->interrupt_enable ?
(mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
(mci_misc_hi &= ~MASK_INT_TYPE_HI);
mci_misc_hi |= MASK_COUNT_EN_HI;
- wrmsr(b->address, mci_misc_lo, mci_misc_hi);
+ wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+ return 0;
}
/* cpu init entry point, called from mce.c with preempt off */
@@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int cpu = smp_processor_id();
u8 lvt_off;
u32 low = 0, high = 0, address = 0;
+ struct thresh_restart tr;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
wrmsr(address, low, high);
threshold_defaults.address = address;
- threshold_restart_bank(&threshold_defaults, 0, 0);
+ tr.b = &threshold_defaults;
+ tr.reset = 0;
+ tr.old_limit = 0;
+ threshold_restart_bank(&tr);
}
}
}
@@ -251,20 +262,6 @@ struct threshold_attr {
ssize_t(*store) (struct threshold_block *, const char *, size_t count);
};
-static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
- cpumask_t *newmask)
-{
- *oldmask = current->cpus_allowed;
- cpus_clear(*newmask);
- cpu_set(cpu, *newmask);
- set_cpus_allowed_ptr(current, newmask);
-}
-
-static void affinity_restore(const cpumask_t *oldmask)
-{
- set_cpus_allowed_ptr(current, oldmask);
-}
-
#define SHOW_FIELDS(name) \
static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
{ \
@@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
- cpumask_t oldmask, newmask;
+ struct thresh_restart tr;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
b->interrupt_enable = !!new;
- affinity_set(b->cpu, &oldmask, &newmask);
- threshold_restart_bank(b, 0, 0);
- affinity_restore(&oldmask);
+ tr.b = b;
+ tr.reset = 0;
+ tr.old_limit = 0;
+ work_on_cpu(b->cpu, threshold_restart_bank, &tr);
return end - buf;
}
@@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
- cpumask_t oldmask, newmask;
- u16 old;
+ struct thresh_restart tr;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
@@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
new = THRESHOLD_MAX;
if (new < 1)
new = 1;
- old = b->threshold_limit;
+ tr.old_limit = b->threshold_limit;
b->threshold_limit = new;
+ tr.b = b;
+ tr.reset = 0;
- affinity_set(b->cpu, &oldmask, &newmask);
- threshold_restart_bank(b, 0, old);
- affinity_restore(&oldmask);
+ work_on_cpu(b->cpu, threshold_restart_bank, &tr);
return end - buf;
}
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
+static long local_error_count(void *_b)
{
- u32 high, low;
- cpumask_t oldmask, newmask;
- affinity_set(b->cpu, &oldmask, &newmask);
+ struct threshold_block *b = _b;
+ u32 low, high;
+
rdmsr(b->address, low, high);
- affinity_restore(&oldmask);
- return sprintf(buf, "%x\n",
- (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+ return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+}
+
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+ return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
}
static ssize_t store_error_count(struct threshold_block *b,
const char *buf, size_t count)
{
- cpumask_t oldmask, newmask;
- affinity_set(b->cpu, &oldmask, &newmask);
- threshold_restart_bank(b, 1, 0);
- affinity_restore(&oldmask);
+ struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
+
+ work_on_cpu(b->cpu, threshold_restart_bank, &tr);
return 1;
}
@@ -463,12 +462,19 @@ out_free:
return err;
}
+static long local_allocate_threshold_blocks(void *_bank)
+{
+ unsigned int *bank = _bank;
+
+ return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
+ MSR_IA32_MC0_MISC + *bank * 4);
+}
+
/* symlinks sibling shared banks to first core. first core owns dir/files. */
static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
int i, err = 0;
struct threshold_bank *b = NULL;
- cpumask_t oldmask, newmask;
char name[32];
sprintf(name, "threshold_bank%i", bank);
@@ -519,11 +525,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
per_cpu(threshold_banks, cpu)[bank] = b;
- affinity_set(cpu, &oldmask, &newmask);
- err = allocate_threshold_blocks(cpu, bank, 0,
- MSR_IA32_MC0_MISC + bank * 4);
- affinity_restore(&oldmask);
-
+ err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
if (err)
goto out_free;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 26855381790..d84a852e4cd 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -29,34 +29,17 @@
#include <mach_ipi.h>
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
-static atomic_t waiting_for_crash_ipi;
-static int crash_nmi_callback(struct notifier_block *self,
- unsigned long val, void *data)
+static void kdump_nmi_callback(int cpu, struct die_args *args)
{
struct pt_regs *regs;
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
#endif
- int cpu;
- if (val != DIE_NMI_IPI)
- return NOTIFY_OK;
-
- regs = ((struct die_args *)data)->regs;
- cpu = raw_smp_processor_id();
-
- /* Don't do anything if this handler is invoked on crashing cpu.
- * Otherwise, system will completely hang. Crashing cpu can get
- * an NMI if system was initially booted with nmi_watchdog parameter.
- */
- if (cpu == crashing_cpu)
- return NOTIFY_STOP;
- local_irq_disable();
+ regs = args->regs;
#ifdef CONFIG_X86_32
if (!user_mode_vm(regs)) {
@@ -65,54 +48,19 @@ static int crash_nmi_callback(struct notifier_block *self,
}
#endif
crash_save_cpu(regs, cpu);
- disable_local_APIC();
- atomic_dec(&waiting_for_crash_ipi);
- /* Assume hlt works */
- halt();
- for (;;)
- cpu_relax();
-
- return 1;
-}
-static void smp_send_nmi_allbutself(void)
-{
- cpumask_t mask = cpu_online_map;
- cpu_clear(safe_smp_processor_id(), mask);
- if (!cpus_empty(mask))
- send_IPI_mask(mask, NMI_VECTOR);
+ disable_local_APIC();
}
-static struct notifier_block crash_nmi_nb = {
- .notifier_call = crash_nmi_callback,
-};
-
-static void nmi_shootdown_cpus(void)
+static void kdump_nmi_shootdown_cpus(void)
{
- unsigned long msecs;
-
- atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- if (register_die_notifier(&crash_nmi_nb))
- return; /* return what? */
- /* Ensure the new callback function is set before sending
- * out the NMI
- */
- wmb();
+ nmi_shootdown_cpus(kdump_nmi_callback);
- smp_send_nmi_allbutself();
-
- msecs = 1000; /* Wait at most a second for the other cpus to stop */
- while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
- mdelay(1);
- msecs--;
- }
-
- /* Leave the nmi callback set */
disable_local_APIC();
}
+
#else
-static void nmi_shootdown_cpus(void)
+static void kdump_nmi_shootdown_cpus(void)
{
/* There are no cpus to shootdown */
}
@@ -131,9 +79,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
/* The kernel is broken so disable interrupts */
local_irq_disable();
- /* Make a note of crashing cpu. Will be used in NMI callback.*/
- crashing_cpu = safe_smp_processor_id();
- nmi_shootdown_cpus();
+ kdump_nmi_shootdown_cpus();
lapic_shutdown();
#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 2b69994fd3a..19a8c2c0389 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -7,13 +7,12 @@
*
* It manages:
* - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - buffer overflow handling (to be done)
* - buffer access
*
* It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * - get_task_struct on all traced tasks
+ * - current is allowed to trace tasks
*
*
* Copyright (C) 2007-2008 Intel Corporation.
@@ -21,8 +20,6 @@
*/
-#ifdef CONFIG_X86_DS
-
#include <asm/ds.h>
#include <linux/errno.h>
@@ -30,6 +27,7 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/kernel.h>
/*
@@ -46,6 +44,33 @@ struct ds_configuration {
};
static struct ds_configuration ds_cfg;
+/*
+ * A BTS or PEBS tracer.
+ *
+ * This holds the configuration of the tracer and serves as a handle
+ * to identify tracers.
+ */
+struct ds_tracer {
+ /* the DS context (partially) owned by this tracer */
+ struct ds_context *context;
+ /* the buffer provided on ds_request() and its size in bytes */
+ void *buffer;
+ size_t size;
+};
+
+struct bts_tracer {
+ /* the common DS part */
+ struct ds_tracer ds;
+ /* buffer overflow notification function */
+ bts_ovfl_callback_t ovfl;
+};
+
+struct pebs_tracer {
+ /* the common DS part */
+ struct ds_tracer ds;
+ /* buffer overflow notification function */
+ pebs_ovfl_callback_t ovfl;
+};
/*
* Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -109,34 +134,13 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
(*(unsigned long *)base) = value;
}
+#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
-/*
- * Locking is done only for allocating BTS or PEBS resources and for
- * guarding context and buffer memory allocation.
- *
- * Most functions require the current task to own the ds context part
- * they are going to access. All the locking is done when validating
- * access to the context.
- */
-static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
/*
- * Validate that the current task is allowed to access the BTS/PEBS
- * buffer of the parameter task.
- *
- * Returns 0, if access is granted; -Eerrno, otherwise.
+ * Locking is done only for allocating BTS or PEBS resources.
*/
-static inline int ds_validate_access(struct ds_context *context,
- enum ds_qualifier qual)
-{
- if (!context)
- return -EPERM;
-
- if (context->owner[qual] == current)
- return 0;
-
- return -EPERM;
-}
+static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
/*
@@ -185,80 +189,43 @@ static inline int check_tracer(struct task_struct *task)
*
* Contexts are use-counted. They are allocated on first access and
* deallocated when the last user puts the context.
- *
- * We distinguish between an allocating and a non-allocating get of a
- * context:
- * - the allocating get is used for requesting BTS/PEBS resources. It
- * requires the caller to hold the global ds_lock.
- * - the non-allocating get is used for all other cases. A
- * non-existing context indicates an error. It acquires and releases
- * the ds_lock itself for obtaining the context.
- *
- * A context and its DS configuration are allocated and deallocated
- * together. A context always has a DS configuration of the
- * appropriate size.
*/
static DEFINE_PER_CPU(struct ds_context *, system_context);
#define this_system_context per_cpu(system_context, smp_processor_id())
-/*
- * Returns the pointer to the parameter task's context or to the
- * system-wide context, if task is NULL.
- *
- * Increases the use count of the returned context, if not NULL.
- */
static inline struct ds_context *ds_get_context(struct task_struct *task)
{
- struct ds_context *context;
-
- spin_lock(&ds_lock);
-
- context = (task ? task->thread.ds_ctx : this_system_context);
- if (context)
- context->count++;
-
- spin_unlock(&ds_lock);
-
- return context;
-}
-
-/*
- * Same as ds_get_context, but allocates the context and it's DS
- * structure, if necessary; returns NULL; if out of memory.
- *
- * pre: requires ds_lock to be held
- */
-static inline struct ds_context *ds_alloc_context(struct task_struct *task)
-{
struct ds_context **p_context =
(task ? &task->thread.ds_ctx : &this_system_context);
struct ds_context *context = *p_context;
+ unsigned long irq;
if (!context) {
context = kzalloc(sizeof(*context), GFP_KERNEL);
-
if (!context)
return NULL;
- context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
- if (!context->ds) {
- kfree(context);
- return NULL;
- }
+ spin_lock_irqsave(&ds_lock, irq);
- *p_context = context;
+ if (*p_context) {
+ kfree(context);
- context->this = p_context;
- context->task = task;
+ context = *p_context;
+ } else {
+ *p_context = context;
- if (task)
- set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
+ context->this = p_context;
+ context->task = task;
- if (!task || (task == current))
- wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
+ if (task)
+ set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
- get_tracer(task);
+ if (!task || (task == current))
+ wrmsrl(MSR_IA32_DS_AREA,
+ (unsigned long)context->ds);
+ }
+ spin_unlock_irqrestore(&ds_lock, irq);
}
context->count++;
@@ -266,16 +233,14 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
return context;
}
-/*
- * Decreases the use count of the parameter context, if not NULL.
- * Deallocates the context, if the use count reaches zero.
- */
static inline void ds_put_context(struct ds_context *context)
{
+ unsigned long irq;
+
if (!context)
return;
- spin_lock(&ds_lock);
+ spin_lock_irqsave(&ds_lock, irq);
if (--context->count)
goto out;
@@ -288,351 +253,351 @@ static inline void ds_put_context(struct ds_context *context)
if (!context->task || (context->task == current))
wrmsrl(MSR_IA32_DS_AREA, 0);
- put_tracer(context->task);
-
- /* free any leftover buffers from tracers that did not
- * deallocate them properly. */
- kfree(context->buffer[ds_bts]);
- kfree(context->buffer[ds_pebs]);
- kfree(context->ds);
kfree(context);
out:
- spin_unlock(&ds_lock);
+ spin_unlock_irqrestore(&ds_lock, irq);
}
/*
* Handle a buffer overflow
*
- * task: the task whose buffers are overflowing;
- * NULL for a buffer overflow on the current cpu
* context: the ds context
* qual: the buffer type
*/
-static void ds_overflow(struct task_struct *task, struct ds_context *context,
- enum ds_qualifier qual)
-{
- if (!context)
- return;
-
- if (context->callback[qual])
- (*context->callback[qual])(task);
-
- /* todo: do some more overflow handling */
+static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
+{
+ switch (qual) {
+ case ds_bts: {
+ struct bts_tracer *tracer =
+ container_of(context->owner[qual],
+ struct bts_tracer, ds);
+ if (tracer->ovfl)
+ tracer->ovfl(tracer);
+ }
+ break;
+ case ds_pebs: {
+ struct pebs_tracer *tracer =
+ container_of(context->owner[qual],
+ struct pebs_tracer, ds);
+ if (tracer->ovfl)
+ tracer->ovfl(tracer);
+ }
+ break;
+ }
}
-/*
- * Allocate a non-pageable buffer of the parameter size.
- * Checks the memory and the locked memory rlimit.
- *
- * Returns the buffer, if successful;
- * NULL, if out of memory or rlimit exceeded.
- *
- * size: the requested buffer size in bytes
- * pages (out): if not NULL, contains the number of pages reserved
- */
-static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
+static void ds_install_ds_config(struct ds_context *context,
+ enum ds_qualifier qual,
+ void *base, size_t size, size_t ith)
{
- unsigned long rlim, vm, pgsz;
- void *buffer;
-
- pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->total_vm + pgsz;
- if (rlim < vm)
- return NULL;
+ unsigned long buffer, adj;
- rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->locked_vm + pgsz;
- if (rlim < vm)
- return NULL;
+ /* adjust the buffer address and size to meet alignment
+ * constraints:
+ * - buffer is double-word aligned
+ * - size is multiple of record size
+ *
+ * We checked the size at the very beginning; we have enough
+ * space to do the adjustment.
+ */
+ buffer = (unsigned long)base;
- buffer = kzalloc(size, GFP_KERNEL);
- if (!buffer)
- return NULL;
+ adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
+ buffer += adj;
+ size -= adj;
- current->mm->total_vm += pgsz;
- current->mm->locked_vm += pgsz;
+ size /= ds_cfg.sizeof_rec[qual];
+ size *= ds_cfg.sizeof_rec[qual];
- if (pages)
- *pages = pgsz;
+ ds_set(context->ds, qual, ds_buffer_base, buffer);
+ ds_set(context->ds, qual, ds_index, buffer);
+ ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
- return buffer;
+ /* The value for 'no threshold' is -1, which will set the
+ * threshold outside of the buffer, just like we want it.
+ */
+ ds_set(context->ds, qual,
+ ds_interrupt_threshold, buffer + size - ith);
}
-static int ds_request(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
+static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
+ struct task_struct *task,
+ void *base, size_t size, size_t th)
{
struct ds_context *context;
- unsigned long buffer, adj;
- const unsigned long alignment = (1 << 3);
- int error = 0;
+ unsigned long irq;
+ int error;
+ error = -EOPNOTSUPP;
if (!ds_cfg.sizeof_ds)
- return -EOPNOTSUPP;
+ goto out;
- /* we require some space to do alignment adjustments below */
- if (size < (alignment + ds_cfg.sizeof_rec[qual]))
- return -EINVAL;
+ error = -EINVAL;
+ if (!base)
+ goto out;
- /* buffer overflow notification is not yet implemented */
- if (ovfl)
- return -EOPNOTSUPP;
+ /* we require some space to do alignment adjustments below */
+ error = -EINVAL;
+ if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+ goto out;
+ if (th != (size_t)-1) {
+ th *= ds_cfg.sizeof_rec[qual];
- spin_lock(&ds_lock);
+ error = -EINVAL;
+ if (size <= th)
+ goto out;
+ }
- if (!check_tracer(task))
- return -EPERM;
+ tracer->buffer = base;
+ tracer->size = size;
error = -ENOMEM;
- context = ds_alloc_context(task);
+ context = ds_get_context(task);
if (!context)
- goto out_unlock;
-
- error = -EALREADY;
- if (context->owner[qual] == current)
- goto out_unlock;
- error = -EPERM;
- if (context->owner[qual] != NULL)
- goto out_unlock;
- context->owner[qual] = current;
-
- spin_unlock(&ds_lock);
+ goto out;
+ tracer->context = context;
- error = -ENOMEM;
- if (!base) {
- base = ds_allocate_buffer(size, &context->pages[qual]);
- if (!base)
- goto out_release;
-
- context->buffer[qual] = base;
- }
- error = 0;
+ spin_lock_irqsave(&ds_lock, irq);
- context->callback[qual] = ovfl;
-
- /* adjust the buffer address and size to meet alignment
- * constraints:
- * - buffer is double-word aligned
- * - size is multiple of record size
- *
- * We checked the size at the very beginning; we have enough
- * space to do the adjustment.
- */
- buffer = (unsigned long)base;
+ error = -EPERM;
+ if (!check_tracer(task))
+ goto out_unlock;
+ get_tracer(task);
- adj = ALIGN(buffer, alignment) - buffer;
- buffer += adj;
- size -= adj;
+ error = -EPERM;
+ if (context->owner[qual])
+ goto out_put_tracer;
+ context->owner[qual] = tracer;
- size /= ds_cfg.sizeof_rec[qual];
- size *= ds_cfg.sizeof_rec[qual];
+ spin_unlock_irqrestore(&ds_lock, irq);
- ds_set(context->ds, qual, ds_buffer_base, buffer);
- ds_set(context->ds, qual, ds_index, buffer);
- ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
- if (ovfl) {
- /* todo: select a suitable interrupt threshold */
- } else
- ds_set(context->ds, qual,
- ds_interrupt_threshold, buffer + size + 1);
+ ds_install_ds_config(context, qual, base, size, th);
- /* we keep the context until ds_release */
- return error;
-
- out_release:
- context->owner[qual] = NULL;
- ds_put_context(context);
- return error;
+ return 0;
+ out_put_tracer:
+ put_tracer(task);
out_unlock:
- spin_unlock(&ds_lock);
+ spin_unlock_irqrestore(&ds_lock, irq);
ds_put_context(context);
+ tracer->context = NULL;
+ out:
return error;
}
-int ds_request_bts(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl)
+struct bts_tracer *ds_request_bts(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl, size_t th)
{
- return ds_request(task, base, size, ovfl, ds_bts);
-}
+ struct bts_tracer *tracer;
+ int error;
-int ds_request_pebs(struct task_struct *task, void *base, size_t size,
- ds_ovfl_callback_t ovfl)
-{
- return ds_request(task, base, size, ovfl, ds_pebs);
+ /* buffer overflow notification is not yet implemented */
+ error = -EOPNOTSUPP;
+ if (ovfl)
+ goto out;
+
+ error = -ENOMEM;
+ tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+ if (!tracer)
+ goto out;
+ tracer->ovfl = ovfl;
+
+ error = ds_request(&tracer->ds, ds_bts, task, base, size, th);
+ if (error < 0)
+ goto out_tracer;
+
+ return tracer;
+
+ out_tracer:
+ kfree(tracer);
+ out:
+ return ERR_PTR(error);
}
-static int ds_release(struct task_struct *task, enum ds_qualifier qual)
+struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl, size_t th)
{
- struct ds_context *context;
+ struct pebs_tracer *tracer;
int error;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
+ /* buffer overflow notification is not yet implemented */
+ error = -EOPNOTSUPP;
+ if (ovfl)
+ goto out;
+
+ error = -ENOMEM;
+ tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+ if (!tracer)
goto out;
+ tracer->ovfl = ovfl;
- kfree(context->buffer[qual]);
- context->buffer[qual] = NULL;
+ error = ds_request(&tracer->ds, ds_pebs, task, base, size, th);
+ if (error < 0)
+ goto out_tracer;
- current->mm->total_vm -= context->pages[qual];
- current->mm->locked_vm -= context->pages[qual];
- context->pages[qual] = 0;
- context->owner[qual] = NULL;
+ return tracer;
- /*
- * we put the context twice:
- * once for the ds_get_context
- * once for the corresponding ds_request
- */
- ds_put_context(context);
+ out_tracer:
+ kfree(tracer);
out:
- ds_put_context(context);
- return error;
+ return ERR_PTR(error);
}
-int ds_release_bts(struct task_struct *task)
+static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)
{
- return ds_release(task, ds_bts);
+ BUG_ON(tracer->context->owner[qual] != tracer);
+ tracer->context->owner[qual] = NULL;
+
+ put_tracer(tracer->context->task);
+ ds_put_context(tracer->context);
}
-int ds_release_pebs(struct task_struct *task)
+int ds_release_bts(struct bts_tracer *tracer)
{
- return ds_release(task, ds_pebs);
+ if (!tracer)
+ return -EINVAL;
+
+ ds_release(&tracer->ds, ds_bts);
+ kfree(tracer);
+
+ return 0;
}
-static int ds_get_index(struct task_struct *task, size_t *pos,
- enum ds_qualifier qual)
+int ds_release_pebs(struct pebs_tracer *tracer)
{
- struct ds_context *context;
- unsigned long base, index;
- int error;
+ if (!tracer)
+ return -EINVAL;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
+ ds_release(&tracer->ds, ds_pebs);
+ kfree(tracer);
+
+ return 0;
+}
+
+static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual)
+{
+ unsigned long base, index;
base = ds_get(context->ds, qual, ds_buffer_base);
index = ds_get(context->ds, qual, ds_index);
- error = ((index - base) / ds_cfg.sizeof_rec[qual]);
- if (pos)
- *pos = error;
- out:
- ds_put_context(context);
- return error;
+ return (index - base) / ds_cfg.sizeof_rec[qual];
}
-int ds_get_bts_index(struct task_struct *task, size_t *pos)
+int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
{
- return ds_get_index(task, pos, ds_bts);
+ if (!tracer)
+ return -EINVAL;
+
+ if (!pos)
+ return -EINVAL;
+
+ *pos = ds_get_index(tracer->ds.context, ds_bts);
+
+ return 0;
}
-int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)
{
- return ds_get_index(task, pos, ds_pebs);
+ if (!tracer)
+ return -EINVAL;
+
+ if (!pos)
+ return -EINVAL;
+
+ *pos = ds_get_index(tracer->ds.context, ds_pebs);
+
+ return 0;
}
-static int ds_get_end(struct task_struct *task, size_t *pos,
- enum ds_qualifier qual)
+static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)
{
- struct ds_context *context;
- unsigned long base, end;
- int error;
-
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
+ unsigned long base, max;
base = ds_get(context->ds, qual, ds_buffer_base);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
+ max = ds_get(context->ds, qual, ds_absolute_maximum);
- error = ((end - base) / ds_cfg.sizeof_rec[qual]);
- if (pos)
- *pos = error;
- out:
- ds_put_context(context);
- return error;
+ return (max - base) / ds_cfg.sizeof_rec[qual];
}
-int ds_get_bts_end(struct task_struct *task, size_t *pos)
+int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
{
- return ds_get_end(task, pos, ds_bts);
+ if (!tracer)
+ return -EINVAL;
+
+ if (!pos)
+ return -EINVAL;
+
+ *pos = ds_get_end(tracer->ds.context, ds_bts);
+
+ return 0;
}
-int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
{
- return ds_get_end(task, pos, ds_pebs);
+ if (!tracer)
+ return -EINVAL;
+
+ if (!pos)
+ return -EINVAL;
+
+ *pos = ds_get_end(tracer->ds.context, ds_pebs);
+
+ return 0;
}
-static int ds_access(struct task_struct *task, size_t index,
- const void **record, enum ds_qualifier qual)
+static int ds_access(struct ds_context *context, enum ds_qualifier qual,
+ size_t index, const void **record)
{
- struct ds_context *context;
unsigned long base, idx;
- int error;
if (!record)
return -EINVAL;
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
-
base = ds_get(context->ds, qual, ds_buffer_base);
idx = base + (index * ds_cfg.sizeof_rec[qual]);
- error = -EINVAL;
if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
- goto out;
+ return -EINVAL;
*record = (const void *)idx;
- error = ds_cfg.sizeof_rec[qual];
- out:
- ds_put_context(context);
- return error;
+
+ return ds_cfg.sizeof_rec[qual];
}
-int ds_access_bts(struct task_struct *task, size_t index, const void **record)
+int ds_access_bts(struct bts_tracer *tracer, size_t index,
+ const void **record)
{
- return ds_access(task, index, record, ds_bts);
+ if (!tracer)
+ return -EINVAL;
+
+ return ds_access(tracer->ds.context, ds_bts, index, record);
}
-int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
+int ds_access_pebs(struct pebs_tracer *tracer, size_t index,
+ const void **record)
{
- return ds_access(task, index, record, ds_pebs);
+ if (!tracer)
+ return -EINVAL;
+
+ return ds_access(tracer->ds.context, ds_pebs, index, record);
}
-static int ds_write(struct task_struct *task, const void *record, size_t size,
- enum ds_qualifier qual, int force)
+static int ds_write(struct ds_context *context, enum ds_qualifier qual,
+ const void *record, size_t size)
{
- struct ds_context *context;
- int error;
+ int bytes_written = 0;
if (!record)
return -EINVAL;
- error = -EPERM;
- context = ds_get_context(task);
- if (!context)
- goto out;
-
- if (!force) {
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
- }
-
- error = 0;
while (size) {
unsigned long base, index, end, write_end, int_th;
unsigned long write_size, adj_write_size;
@@ -660,14 +625,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
write_end = end;
if (write_end <= index)
- goto out;
+ break;
write_size = min((unsigned long) size, write_end - index);
memcpy((void *)index, record, write_size);
record = (const char *)record + write_size;
- size -= write_size;
- error += write_size;
+ size -= write_size;
+ bytes_written += write_size;
adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
adj_write_size *= ds_cfg.sizeof_rec[qual];
@@ -682,47 +647,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
ds_set(context->ds, qual, ds_index, index);
if (index >= int_th)
- ds_overflow(task, context, qual);
+ ds_overflow(context, qual);
}
- out:
- ds_put_context(context);
- return error;
+ return bytes_written;
}
-int ds_write_bts(struct task_struct *task, const void *record, size_t size)
+int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)
{
- return ds_write(task, record, size, ds_bts, /* force = */ 0);
-}
+ if (!tracer)
+ return -EINVAL;
-int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
-{
- return ds_write(task, record, size, ds_pebs, /* force = */ 0);
+ return ds_write(tracer->ds.context, ds_bts, record, size);
}
-int ds_unchecked_write_bts(struct task_struct *task,
- const void *record, size_t size)
+int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)
{
- return ds_write(task, record, size, ds_bts, /* force = */ 1);
-}
+ if (!tracer)
+ return -EINVAL;
-int ds_unchecked_write_pebs(struct task_struct *task,
- const void *record, size_t size)
-{
- return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+ return ds_write(tracer->ds.context, ds_pebs, record, size);
}
-static int ds_reset_or_clear(struct task_struct *task,
- enum ds_qualifier qual, int clear)
+static void ds_reset_or_clear(struct ds_context *context,
+ enum ds_qualifier qual, int clear)
{
- struct ds_context *context;
unsigned long base, end;
- int error;
-
- context = ds_get_context(task);
- error = ds_validate_access(context, qual);
- if (error < 0)
- goto out;
base = ds_get(context->ds, qual, ds_buffer_base);
end = ds_get(context->ds, qual, ds_absolute_maximum);
@@ -731,89 +681,100 @@ static int ds_reset_or_clear(struct task_struct *task,
memset((void *)base, 0, end - base);
ds_set(context->ds, qual, ds_index, base);
-
- error = 0;
- out:
- ds_put_context(context);
- return error;
}
-int ds_reset_bts(struct task_struct *task)
+int ds_reset_bts(struct bts_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+ if (!tracer)
+ return -EINVAL;
+
+ ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0);
+
+ return 0;
}
-int ds_reset_pebs(struct task_struct *task)
+int ds_reset_pebs(struct pebs_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+ if (!tracer)
+ return -EINVAL;
+
+ ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0);
+
+ return 0;
}
-int ds_clear_bts(struct task_struct *task)
+int ds_clear_bts(struct bts_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+ if (!tracer)
+ return -EINVAL;
+
+ ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1);
+
+ return 0;
}
-int ds_clear_pebs(struct task_struct *task)
+int ds_clear_pebs(struct pebs_tracer *tracer)
{
- return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+ if (!tracer)
+ return -EINVAL;
+
+ ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1);
+
+ return 0;
}
-int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)
{
- struct ds_context *context;
- int error;
+ if (!tracer)
+ return -EINVAL;
if (!value)
return -EINVAL;
- context = ds_get_context(task);
- error = ds_validate_access(context, ds_pebs);
- if (error < 0)
- goto out;
+ *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
- *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
-
- error = 0;
- out:
- ds_put_context(context);
- return error;
+ return 0;
}
-int ds_set_pebs_reset(struct task_struct *task, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
{
- struct ds_context *context;
- int error;
-
- context = ds_get_context(task);
- error = ds_validate_access(context, ds_pebs);
- if (error < 0)
- goto out;
+ if (!tracer)
+ return -EINVAL;
- *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+ *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
- error = 0;
- out:
- ds_put_context(context);
- return error;
+ return 0;
}
static const struct ds_configuration ds_cfg_var = {
.sizeof_ds = sizeof(long) * 12,
.sizeof_field = sizeof(long),
.sizeof_rec[ds_bts] = sizeof(long) * 3,
+#ifdef __i386__
.sizeof_rec[ds_pebs] = sizeof(long) * 10
+#else
+ .sizeof_rec[ds_pebs] = sizeof(long) * 18
+#endif
};
static const struct ds_configuration ds_cfg_64 = {
.sizeof_ds = 8 * 12,
.sizeof_field = 8,
.sizeof_rec[ds_bts] = 8 * 3,
+#ifdef __i386__
.sizeof_rec[ds_pebs] = 8 * 10
+#else
+ .sizeof_rec[ds_pebs] = 8 * 18
+#endif
};
static inline void
ds_configure(const struct ds_configuration *cfg)
{
ds_cfg = *cfg;
+
+ printk(KERN_INFO "DS available\n");
+
+ BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);
}
void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -821,17 +782,16 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
+ case 0 ... 0xC:
+ /* sorry, don't know about them */
+ break;
case 0xD:
case 0xE: /* Pentium M */
ds_configure(&ds_cfg_var);
break;
- case 0xF: /* Core2 */
- case 0x1C: /* Atom */
+ default: /* Core2, Atom, ... */
ds_configure(&ds_cfg_64);
break;
- default:
- /* sorry, don't know about them */
- break;
}
break;
case 0xF:
@@ -858,7 +818,8 @@ void ds_free(struct ds_context *context)
* is dying. There should not be any user of that context left
* to disturb us, anymore. */
unsigned long leftovers = context->count;
- while (leftovers--)
+ while (leftovers--) {
+ put_tracer(context->task);
ds_put_context(context);
+ }
}
-#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 00000000000..6b1f6f6f866
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#include "dumpstack.h"
+
+int panic_on_unrecovered_nmi;
+unsigned int code_bytes = 64;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static int die_counter;
+
+void printk_address(unsigned long address, int reliable)
+{
+ printk(" [<%p>] %s%pS\n", (void *) address,
+ reliable ? "" : "? ", (void *) address);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+ const struct stacktrace_ops *ops,
+ struct thread_info *tinfo, int *graph)
+{
+ struct task_struct *task = tinfo->task;
+ unsigned long ret_addr;
+ int index = task->curr_ret_stack;
+
+ if (addr != (unsigned long)return_to_handler)
+ return;
+
+ if (!task->ret_stack || index < *graph)
+ return;
+
+ index -= *graph;
+ ret_addr = task->ret_stack[index].ret;
+
+ ops->address(data, ret_addr, 1);
+
+ (*graph)++;
+}
+#else
+static inline void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+ const struct stacktrace_ops *ops,
+ struct thread_info *tinfo, int *graph)
+{ }
+#endif
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+ void *p, unsigned int size, void *end)
+{
+ void *t = tinfo;
+ if (end) {
+ if (p < end && p >= (end-THREAD_SIZE))
+ return 1;
+ else
+ return 0;
+ }
+ return p > t && p < t + THREAD_SIZE - size;
+}
+
+unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end, int *graph)
+{
+ struct stack_frame *frame = (struct stack_frame *)bp;
+
+ while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+ unsigned long addr;
+
+ addr = *stack;
+ if (__kernel_text_address(addr)) {
+ if ((unsigned long) stack == bp + sizeof(long)) {
+ ops->address(data, addr, 1);
+ frame = frame->next_frame;
+ bp = (unsigned long) frame;
+ } else {
+ ops->address(data, addr, bp == 0);
+ }
+ print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+ }
+ stack++;
+ }
+ return bp;
+}
+
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ printk(data);
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ printk("%s <%s> ", (char *)data, name);
+ return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+ touch_nmi_watchdog();
+ printk(data);
+ printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+ printk("%sCall Trace:\n", log_lvl);
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp)
+{
+ show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+ show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+ unsigned long bp = 0;
+ unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+ if (!bp)
+ get_bp(bp);
+#endif
+
+ printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ current->pid, current->comm, print_tainted(),
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ show_trace(NULL, NULL, &stack, bp);
+}
+EXPORT_SYMBOL(dump_stack);
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ oops_enter();
+
+ /* racy, but better than risking deadlock. */
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ if (!__raw_spin_trylock(&die_lock)) {
+ if (cpu == die_owner)
+ /* nested oops. should stop eventually */;
+ else
+ __raw_spin_lock(&die_lock);
+ }
+ die_nest_count++;
+ die_owner = cpu;
+ console_verbose();
+ bust_spinlocks(1);
+ return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+ if (regs && kexec_should_crash(current))
+ crash_kexec(regs);
+
+ bust_spinlocks(0);
+ die_owner = -1;
+ add_taint(TAINT_DIE);
+ die_nest_count--;
+ if (!die_nest_count)
+ /* Nest count reaches zero, release the lock. */
+ __raw_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+ oops_exit();
+
+ if (!signr)
+ return;
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+ if (panic_on_oops)
+ panic("Fatal exception");
+ do_exit(signr);
+}
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+#ifdef CONFIG_X86_32
+ unsigned short ss;
+ unsigned long sp;
+#endif
+ printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+ printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+ printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk("DEBUG_PAGEALLOC");
+#endif
+ printk("\n");
+ sysfs_printk_last_file();
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ return 1;
+
+ show_registers(regs);
+#ifdef CONFIG_X86_32
+ sp = (unsigned long) (&regs->sp);
+ savesegment(ss, ss);
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss & 0xffff;
+ }
+ printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+ print_symbol("%s", regs->ip);
+ printk(" SS:ESP %04x:%08lx\n", ss, sp);
+#else
+ /* Executive summary in case the oops scrolled away */
+ printk(KERN_ALERT "RIP ");
+ printk_address(regs->ip, 1);
+ printk(" RSP <%016lx>\n", regs->sp);
+#endif
+ return 0;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+ unsigned long flags = oops_begin();
+ int sig = SIGSEGV;
+
+ if (!user_mode_vm(regs))
+ report_bug(regs->ip, regs);
+
+ if (__die(str, regs, err))
+ sig = 0;
+ oops_end(flags, regs, sig);
+}
+
+void notrace __kprobes
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+ unsigned long flags;
+
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
+
+ /*
+ * We are in trouble anyway, lets at least try
+ * to get a message out.
+ */
+ flags = oops_begin();
+ printk(KERN_EMERG "%s", str);
+ printk(" on CPU%d, ip %08lx, registers:\n",
+ smp_processor_id(), regs->ip);
+ show_registers(regs);
+ oops_end(flags, regs, 0);
+ if (do_panic || panic_on_oops)
+ panic("Non maskable interrupt");
+ nmi_exit();
+ local_irq_enable();
+ do_exit(SIGBUS);
+}
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+ return 0;
+}
+early_param("kstack", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+ code_bytes = simple_strtoul(s, NULL, 0);
+ if (code_bytes > 8192)
+ code_bytes = 8192;
+
+ return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644
index 00000000000..da87590b869
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
+#ifndef DUMPSTACK_H
+#define DUMPSTACK_H
+
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data,
+ unsigned long *end, int *graph);
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+extern int kstack_depth_to_print;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b3614752197..d593cd1f58d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,69 +17,14 @@
#include <asm/stacktrace.h>
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
- printk(" [<%p>] %s%pS\n", (void *) address,
- reliable ? "" : "? ", (void *) address);
-}
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
-{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, bp == 0);
- }
- }
- stack++;
- }
- return bp;
-}
+#include "dumpstack.h"
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
{
+ int graph = 0;
+
if (!task)
task = current;
@@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
context = (struct thread_info *)
((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = print_context_stack(context, stack, bp, ops, data, NULL);
+ bp = print_context_stack(context, stack, bp, ops,
+ data, NULL, &graph);
stack = (unsigned long *)context->previous_esp;
if (!stack)
@@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
}
EXPORT_SYMBOL(dump_trace);
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
- printk("%s <%s> ", (char *)data, name);
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk(data);
- printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl)
{
@@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
- show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
- unsigned long bp = 0;
- unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
-
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
void show_registers(struct pt_regs *regs)
{
@@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
- unsigned long flags;
-
- oops_enter();
-
- if (die_owner != raw_smp_processor_id()) {
- console_verbose();
- raw_local_irq_save(flags);
- __raw_spin_lock(&die_lock);
- die_owner = smp_processor_id();
- die_nest_count = 0;
- bust_spinlocks(1);
- } else {
- raw_local_irq_save(flags);
- }
- die_nest_count++;
- return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
- bust_spinlocks(0);
- die_owner = -1;
- add_taint(TAINT_DIE);
- __raw_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
-
- if (!regs)
- return;
-
- if (kexec_should_crash(current))
- crash_kexec(regs);
- if (in_interrupt())
- panic("Fatal exception in interrupt");
- if (panic_on_oops)
- panic("Fatal exception");
- oops_exit();
- do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned short ss;
- unsigned long sp;
-
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- sysfs_printk_last_file();
- if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
- return 1;
-
- show_registers(regs);
- /* Executive summary in case the oops scrolled away */
- sp = (unsigned long) (&regs->sp);
- savesegment(ss, ss);
- if (user_mode(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- }
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
- return 0;
-}
-
-/*
- * This is gone through when something in the kernel has done something bad
- * and is about to be terminated:
- */
-void die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned long flags = oops_begin();
-
- if (die_nest_count < 3) {
- report_bug(regs->ip, regs);
-
- if (__die(str, regs, err))
- regs = NULL;
- } else {
- printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
- }
-
- oops_end(flags, regs, SIGSEGV);
-}
-
-static DEFINE_SPINLOCK(nmi_print_lock);
-
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- spin_lock(&nmi_print_lock);
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out:
- */
- bust_spinlocks(1);
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- if (do_panic)
- panic("Non maskable interrupt");
- console_silent();
- spin_unlock(&nmi_print_lock);
-
- /*
- * If we are in kernel we are probably nested up pretty bad
- * and might aswell get out now while we still can:
- */
- if (!user_mode_vm(regs)) {
- current->thread.trap_no = 2;
- crash_kexec(regs);
- }
-
- bust_spinlocks(0);
- do_exit(SIGSEGV);
-}
-
-static int __init oops_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- if (!strcmp(s, "panic"))
- panic_on_oops = 1;
- return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
- return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
-
- return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7da8a..c302d070704 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -17,19 +17,7 @@
#include <asm/stacktrace.h>
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
- printk(" [<%p>] %s%pS\n", (void *) address,
- reliable ? "" : "? ", (void *) address);
-}
+#include "dumpstack.h"
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
unsigned *usedp, char **idp)
@@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
*/
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
-{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, bp == 0);
- }
- }
- stack++;
- }
- return bp;
-}
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
@@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
unsigned used = 0;
struct thread_info *tinfo;
+ int graph = 0;
if (!task)
task = current;
@@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
break;
bp = print_context_stack(tinfo, stack, bp, ops,
- data, estack_end);
+ data, estack_end, &graph);
ops->stack(data, "<EOE>");
/*
* We link to the next stack via the
@@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (ops->stack(data, "IRQ") < 0)
break;
bp = print_context_stack(tinfo, stack, bp,
- ops, data, irqstack_end);
+ ops, data, irqstack_end, &graph);
/*
* We link to the next stack (which would be
* the process stack normally) the last
@@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
/*
* This handles the process stack:
*/
- bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+ bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
put_cpu();
}
EXPORT_SYMBOL(dump_trace);
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
- printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
- printk("%s <%s> ", (char *)data, name);
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk(data);
- printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl)
{
@@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
- show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
- unsigned long bp = 0;
- unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
-
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
-}
-EXPORT_SYMBOL(dump_stack);
-
void show_registers(struct pt_regs *regs)
{
int i;
@@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip)
return ud2 == 0x0b0f;
}
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
- int cpu;
- unsigned long flags;
-
- oops_enter();
-
- /* racy, but better than risking deadlock. */
- raw_local_irq_save(flags);
- cpu = smp_processor_id();
- if (!__raw_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
- /* nested oops. should stop eventually */;
- else
- __raw_spin_lock(&die_lock);
- }
- die_nest_count++;
- die_owner = cpu;
- console_verbose();
- bust_spinlocks(1);
- return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
- die_owner = -1;
- bust_spinlocks(0);
- die_nest_count--;
- if (!die_nest_count)
- /* Nest count reaches zero, release the lock. */
- __raw_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
- if (!regs) {
- oops_exit();
- return;
- }
- if (in_interrupt())
- panic("Fatal exception in interrupt");
- if (panic_on_oops)
- panic("Fatal exception");
- oops_exit();
- do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- sysfs_printk_last_file();
- if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
- return 1;
-
- show_registers(regs);
- add_taint(TAINT_DIE);
- /* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP ");
- printk_address(regs->ip, 1);
- printk(" RSP <%016lx>\n", regs->sp);
- if (kexec_should_crash(current))
- crash_kexec(regs);
- return 0;
-}
-
-void die(const char *str, struct pt_regs *regs, long err)
-{
- unsigned long flags = oops_begin();
-
- if (!user_mode(regs))
- report_bug(regs->ip, regs);
-
- if (__die(str, regs, err))
- regs = NULL;
- oops_end(flags, regs, SIGSEGV);
-}
-
-notrace __kprobes void
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
- unsigned long flags;
-
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
- flags = oops_begin();
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out.
- */
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- if (kexec_should_crash(current))
- crash_kexec(regs);
- if (do_panic || panic_on_oops)
- panic("Non maskable interrupt");
- oops_end(flags, NULL, SIGBUS);
- nmi_exit();
- local_irq_enable();
- do_exit(SIGBUS);
-}
-
-static int __init oops_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- if (!strcmp(s, "panic"))
- panic_on_oops = 1;
- return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
- return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
-
- return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 3ce029ffaa5..1b894b72c0f 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -188,20 +188,6 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif
-#ifdef CONFIG_DMAR
-static void __init intel_g33_dmar(int num, int slot, int func)
-{
- struct acpi_table_header *dmar_tbl;
- acpi_status status;
-
- status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
- if (ACPI_SUCCESS(status)) {
- printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
- dmar_disabled = 1;
- }
-}
-#endif
-
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -225,10 +211,6 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
-#ifdef CONFIG_DMAR
- { PCI_VENDOR_ID_INTEL, 0x29c0,
- PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
-#endif
{}
};
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597ef9ca..43ceb3f454b 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1157,6 +1157,9 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
pushl %eax
pushl %ecx
pushl %edx
@@ -1171,6 +1174,11 @@ ftrace_call:
popl %edx
popl %ecx
popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
.globl ftrace_stub
ftrace_stub:
@@ -1180,8 +1188,18 @@ END(ftrace_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
cmpl $ftrace_stub, ftrace_trace_function
jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
.globl ftrace_stub
ftrace_stub:
ret
@@ -1200,12 +1218,43 @@ trace:
popl %edx
popl %ecx
popl %eax
-
jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %edx
+ lea 0x4(%ebp), %eax
+ subl $MCOUNT_INSN_SIZE, %edx
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl $0
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ call ftrace_return_to_handler
+ movl %eax, 0xc(%esp)
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+#endif
+
.section .rodata,"a"
#include "syscall_table_32.S"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332c96a..54e0bbdccb9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -68,6 +68,8 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
/* taken from glibc */
subq $0x38, %rsp
@@ -96,6 +98,12 @@ ftrace_call:
movq (%rsp), %rax
addq $0x38, %rsp
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
+
.globl ftrace_stub
ftrace_stub:
retq
@@ -103,8 +111,20 @@ END(ftrace_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
.globl ftrace_stub
ftrace_stub:
retq
@@ -140,6 +160,69 @@ END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ subq $0x38, %rsp
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+
+ leaq 8(%rbp), %rdi
+ movq 0x38(%rsp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rsi
+
+ call prepare_ftrace_return
+
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $0x38, %rsp
+ retq
+END(ftrace_graph_caller)
+
+
+.globl return_to_handler
+return_to_handler:
+ subq $80, %rsp
+
+ movq %rax, (%rsp)
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 24(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r8, 40(%rsp)
+ movq %r9, 48(%rsp)
+ movq %r10, 56(%rsp)
+ movq %r11, 64(%rsp)
+
+ call ftrace_return_to_handler
+
+ movq %rax, 72(%rsp)
+ movq 64(%rsp), %r11
+ movq 56(%rsp), %r10
+ movq 48(%rsp), %r9
+ movq 40(%rsp), %r8
+ movq 32(%rsp), %rdi
+ movq 24(%rsp), %rsi
+ movq 16(%rsp), %rdx
+ movq 8(%rsp), %rcx
+ movq (%rsp), %rax
+ addq $72, %rsp
+ retq
+#endif
+
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index f454c78fcef..53699c931ad 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -38,8 +38,11 @@
#include <asm/io.h>
#include <asm/nmi.h>
#include <asm/smp.h>
+#include <asm/atomic.h>
#include <asm/apicdef.h>
#include <mach_mpparse.h>
+#include <asm/genapic.h>
+#include <asm/setup.h>
/*
* ES7000 chipsets
@@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi)
return gsi;
}
+static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+{
+ unsigned long vect = 0, psaival = 0;
+
+ if (psai == NULL)
+ return -1;
+
+ vect = ((unsigned long)__pa(eip)/0x1000) << 16;
+ psaival = (0x1000000 | vect | cpu);
+
+ while (*psai & 0x1000000)
+ ;
+
+ *psai = psaival;
+
+ return 0;
+}
+
+static void noop_wait_for_deassert(atomic_t *deassert_not_used)
+{
+}
+
+static int __init es7000_update_genapic(void)
+{
+ genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+
+ /* MPENTIUMIII */
+ if (boot_cpu_data.x86 == 6 &&
+ (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) {
+ es7000_update_genapic_to_cluster();
+ genapic->wait_for_init_deassert = noop_wait_for_deassert;
+ genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+ }
+
+ return 0;
+}
+
void __init
setup_unisys(void)
{
@@ -176,6 +216,8 @@ setup_unisys(void)
else
es7000_plat = ES7000_CLASSIC;
ioapic_renumber_irq = es7000_rename_gsi;
+
+ x86_quirks->update_genapic = es7000_update_genapic;
}
/*
@@ -250,31 +292,24 @@ int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
{
struct acpi_table_header *header = NULL;
int i = 0;
- acpi_size tbl_size;
- while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
+ while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
struct oem_table *t = (struct oem_table *)header;
oem_addrX = t->OEMTableAddr;
oem_size = t->OEMTableSize;
- early_acpi_os_unmap_memory(header, tbl_size);
*oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
oem_size);
return 0;
}
- early_acpi_os_unmap_memory(header, tbl_size);
}
return -1;
}
void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
{
- if (!oem_addr)
- return;
-
- __acpi_unmap_table((char *)oem_addr, oem_size);
}
#endif
@@ -324,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg)
return status;
}
-int
-es7000_start_cpu(int cpu, unsigned long eip)
-{
- unsigned long vect = 0, psaival = 0;
-
- if (psai == NULL)
- return -1;
-
- vect = ((unsigned long)__pa(eip)/0x1000) << 16;
- psaival = (0x1000000 | vect | cpu);
-
- while (*psai & 0x1000000)
- ;
-
- *psai = psaival;
-
- return 0;
-
-}
-
void __init
es7000_sw_apic(void)
{
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9b..1b43086b097 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/percpu.h>
+#include <linux/sched.h>
#include <linux/init.h>
#include <linux/list.h>
#include <asm/ftrace.h>
+#include <linux/ftrace.h>
#include <asm/nops.h>
+#include <asm/nmi.h>
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+#ifdef CONFIG_DYNAMIC_FTRACE
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
} __attribute__((packed));
};
-
static int ftrace_calc_offset(long ip, long addr)
{
return (int)(addr - ip);
}
-unsigned char *ftrace_nop_replace(void)
-{
- return ftrace_nop;
-}
-
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
static union ftrace_code_union calc;
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}
-int
+/*
+ * Modifying code must take extra care. On an SMP machine, if
+ * the code being modified is also being executed on another CPU
+ * that CPU will have undefined results and possibly take a GPF.
+ * We use kstop_machine to stop other CPUS from exectuing code.
+ * But this does not stop NMIs from happening. We still need
+ * to protect against that. We separate out the modification of
+ * the code to take care of this.
+ *
+ * Two buffers are added: An IP buffer and a "code" buffer.
+ *
+ * 1) Put the instruction pointer into the IP buffer
+ * and the new code into the "code" buffer.
+ * 2) Set a flag that says we are modifying code
+ * 3) Wait for any running NMIs to finish.
+ * 4) Write the code
+ * 5) clear the flag.
+ * 6) Wait for any running NMIs to finish.
+ *
+ * If an NMI is executed, the first thing it does is to call
+ * "ftrace_nmi_enter". This will check if the flag is set to write
+ * and if it is, it will write what is in the IP and "code" buffers.
+ *
+ * The trick is, it does not matter if everyone is writing the same
+ * content to the code location. Also, if a CPU is executing code
+ * it is OK to write to that code location if the contents being written
+ * are the same as what exists.
+ */
+
+static atomic_t in_nmi = ATOMIC_INIT(0);
+static int mod_code_status; /* holds return value of text write */
+static int mod_code_write; /* set when NMI should do the write */
+static void *mod_code_ip; /* holds the IP to write to */
+static void *mod_code_newcode; /* holds the text to write to the IP */
+
+static unsigned nmi_wait_count;
+static atomic_t nmi_update_count = ATOMIC_INIT(0);
+
+int ftrace_arch_read_dyn_info(char *buf, int size)
+{
+ int r;
+
+ r = snprintf(buf, size, "%u %u",
+ nmi_wait_count,
+ atomic_read(&nmi_update_count));
+ return r;
+}
+
+static void ftrace_mod_code(void)
+{
+ /*
+ * Yes, more than one CPU process can be writing to mod_code_status.
+ * (and the code itself)
+ * But if one were to fail, then they all should, and if one were
+ * to succeed, then they all should.
+ */
+ mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
+ MCOUNT_INSN_SIZE);
+}
+
+void ftrace_nmi_enter(void)
+{
+ atomic_inc(&in_nmi);
+ /* Must have in_nmi seen before reading write flag */
+ smp_mb();
+ if (mod_code_write) {
+ ftrace_mod_code();
+ atomic_inc(&nmi_update_count);
+ }
+}
+
+void ftrace_nmi_exit(void)
+{
+ /* Finish all executions before clearing in_nmi */
+ smp_wmb();
+ atomic_dec(&in_nmi);
+}
+
+static void wait_for_nmi(void)
+{
+ int waited = 0;
+
+ while (atomic_read(&in_nmi)) {
+ waited = 1;
+ cpu_relax();
+ }
+
+ if (waited)
+ nmi_wait_count++;
+}
+
+static int
+do_ftrace_mod_code(unsigned long ip, void *new_code)
+{
+ mod_code_ip = (void *)ip;
+ mod_code_newcode = new_code;
+
+ /* The buffers need to be visible before we let NMIs write them */
+ smp_wmb();
+
+ mod_code_write = 1;
+
+ /* Make sure write bit is visible before we wait on NMIs */
+ smp_mb();
+
+ wait_for_nmi();
+
+ /* Make sure all running NMIs have finished before we write the code */
+ smp_mb();
+
+ ftrace_mod_code();
+
+ /* Make sure the write happens before clearing the bit */
+ smp_wmb();
+
+ mod_code_write = 0;
+
+ /* make sure NMIs see the cleared bit */
+ smp_mb();
+
+ wait_for_nmi();
+
+ return mod_code_status;
+}
+
+
+
+
+static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+
+static unsigned char *ftrace_nop_replace(void)
+{
+ return ftrace_nop;
+}
+
+static int
ftrace_modify_code(unsigned long ip, unsigned char *old_code,
unsigned char *new_code)
{
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
return -EINVAL;
/* replace the text with the new text */
- if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+ if (do_ftrace_mod_code(ip, new_code))
return -EPERM;
sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
return 0;
}
+int ftrace_make_nop(struct module *mod,
+ struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned char *new, *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_call_replace(ip, addr);
+ new = ftrace_nop_replace();
+
+ return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned char *new, *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_nop_replace();
+ new = ftrace_call_replace(ip, addr);
+
+ return ftrace_modify_code(rec->ip, old, new);
+}
+
int ftrace_update_ftrace_func(ftrace_func_t func)
{
unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)
return 0;
}
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+static int ftrace_mod_jmp(unsigned long ip,
+ int old_offset, int new_offset)
+{
+ unsigned char code[MCOUNT_INSN_SIZE];
+
+ if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
+ return -EINVAL;
+
+ *(int *)(&code[1]) = new_offset;
+
+ if (do_ftrace_mod_code(ip, &code))
+ return -EPERM;
+
+ return 0;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+ unsigned long ip = (unsigned long)(&ftrace_graph_call);
+ int old_offset, new_offset;
+
+ old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+ new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+
+ return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+ unsigned long ip = (unsigned long)(&ftrace_graph_call);
+ int old_offset, new_offset;
+
+ old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+ new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+
+ return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * These functions are picked from those used on
+ * this page for dynamic ftrace. They have been
+ * simplified to ignore all traces in NMI context.
+ */
+static atomic_t in_nmi;
+
+void ftrace_nmi_enter(void)
+{
+ atomic_inc(&in_nmi);
+}
+
+void ftrace_nmi_exit(void)
+{
+ atomic_dec(&in_nmi);
+}
+
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/* Add a function return address to the trace stack on thread info.*/
+static int push_return_trace(unsigned long ret, unsigned long long time,
+ unsigned long func, int *depth)
+{
+ int index;
+
+ if (!current->ret_stack)
+ return -EBUSY;
+
+ /* The return trace stack is full */
+ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ atomic_inc(&current->trace_overrun);
+ return -EBUSY;
+ }
+
+ index = ++current->curr_ret_stack;
+ barrier();
+ current->ret_stack[index].ret = ret;
+ current->ret_stack[index].func = func;
+ current->ret_stack[index].calltime = time;
+ *depth = index;
+
+ return 0;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+ int index;
+
+ index = current->curr_ret_stack;
+
+ if (unlikely(index < 0)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic, otherwise we have no where to go */
+ *ret = (unsigned long)panic;
+ return;
+ }
+
+ *ret = current->ret_stack[index].ret;
+ trace->func = current->ret_stack[index].func;
+ trace->calltime = current->ret_stack[index].calltime;
+ trace->overrun = atomic_read(&current->trace_overrun);
+ trace->depth = index;
+ barrier();
+ current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+ struct ftrace_graph_ret trace;
+ unsigned long ret;
+
+ pop_return_trace(&trace, &ret);
+ trace.rettime = cpu_clock(raw_smp_processor_id());
+ ftrace_graph_return(&trace);
+
+ if (unlikely(!ret)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ ret = (unsigned long)panic;
+ }
+
+ return ret;
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+ unsigned long old;
+ unsigned long long calltime;
+ int faulted;
+ struct ftrace_graph_ent trace;
+ unsigned long return_hooker = (unsigned long)
+ &return_to_handler;
+
+ /* Nmi's are currently unsupported */
+ if (unlikely(atomic_read(&in_nmi)))
+ return;
+
+ if (unlikely(atomic_read(&current->tracing_graph_pause)))
+ return;
+
+ /*
+ * Protect against fault, even if it shouldn't
+ * happen. This tool is too much intrusive to
+ * ignore such a protection.
+ */
+ asm volatile(
+ "1: " _ASM_MOV " (%[parent_old]), %[old]\n"
+ "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
+ " movl $0, %[faulted]\n"
+
+ ".section .fixup, \"ax\"\n"
+ "3: movl $1, %[faulted]\n"
+ ".previous\n"
+
+ _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE(2b, 3b)
+
+ : [parent_replaced] "=r" (parent), [old] "=r" (old),
+ [faulted] "=r" (faulted)
+ : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
+ : "memory"
+ );
+
+ if (unlikely(faulted)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ return;
+ }
+
+ if (unlikely(!__kernel_text_address(old))) {
+ ftrace_graph_stop();
+ *parent = old;
+ WARN_ON(1);
+ return;
+ }
+
+ calltime = cpu_clock(raw_smp_processor_id());
+
+ if (push_return_trace(old, calltime,
+ self_addr, &trace.depth) == -EBUSY) {
+ *parent = old;
+ return;
+ }
+
+ trace.func = self_addr;
+
+ /* Only trace if the calling function expects to */
+ if (!ftrace_graph_entry(&trace)) {
+ current->curr_ret_stack--;
+ *parent = old;
+ }
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 6c9bfc9e1e9..2bced78b0b8 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -21,6 +21,7 @@
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
+#include <asm/setup.h>
extern struct genapic apic_flat;
extern struct genapic apic_physflat;
@@ -53,6 +54,9 @@ void __init setup_apic_routing(void)
genapic = &apic_physflat;
printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
}
+
+ if (x86_quirks->update_genapic)
+ x86_quirks->update_genapic();
}
/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index c0262791bda..7fa5f49c2dd 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -30,12 +30,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 1;
}
-static cpumask_t flat_target_cpus(void)
+static const struct cpumask *flat_target_cpus(void)
{
- return cpu_online_map;
+ return cpu_online_mask;
}
-static cpumask_t flat_vector_allocation_domain(int cpu)
+static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
/* Careful. Some cpus do not strictly honor the set of cpus
* specified in the interrupt destination when using lowest
@@ -45,8 +45,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu)
* deliver interrupts to the wrong hyperthread when only one
* hyperthread was specified in the interrupt desitination.
*/
- cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
- return domain;
+ cpumask_clear(retmask);
+ cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
}
/*
@@ -69,9 +69,8 @@ static void flat_init_apic_ldr(void)
apic_write(APIC_LDR, val);
}
-static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
+static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
{
- unsigned long mask = cpus_addr(cpumask)[0];
unsigned long flags;
local_irq_save(flags);
@@ -79,20 +78,41 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
local_irq_restore(flags);
}
+static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+
+ _flat_send_IPI_mask(mask, vector);
+}
+
+static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+ int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+ int cpu = smp_processor_id();
+
+ if (cpu < BITS_PER_LONG)
+ clear_bit(cpu, &mask);
+ _flat_send_IPI_mask(mask, vector);
+}
+
static void flat_send_IPI_allbutself(int vector)
{
+ int cpu = smp_processor_id();
#ifdef CONFIG_HOTPLUG_CPU
int hotplug = 1;
#else
int hotplug = 0;
#endif
if (hotplug || vector == NMI_VECTOR) {
- cpumask_t allbutme = cpu_online_map;
+ if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
+ unsigned long mask = cpumask_bits(cpu_online_mask)[0];
- cpu_clear(smp_processor_id(), allbutme);
+ if (cpu < BITS_PER_LONG)
+ clear_bit(cpu, &mask);
- if (!cpus_empty(allbutme))
- flat_send_IPI_mask(allbutme, vector);
+ _flat_send_IPI_mask(mask, vector);
+ }
} else if (num_online_cpus() > 1) {
__send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
}
@@ -101,7 +121,7 @@ static void flat_send_IPI_allbutself(int vector)
static void flat_send_IPI_all(int vector)
{
if (vector == NMI_VECTOR)
- flat_send_IPI_mask(cpu_online_map, vector);
+ flat_send_IPI_mask(cpu_online_mask, vector);
else
__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
}
@@ -135,9 +155,18 @@ static int flat_apic_id_registered(void)
return physid_isset(read_xapic_id(), phys_cpu_present_map);
}
-static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+}
+
+static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
{
- return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
+ unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+ unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
+
+ return mask1 & mask2;
}
static unsigned int phys_pkg_id(int index_msb)
@@ -157,8 +186,10 @@ struct genapic apic_flat = {
.send_IPI_all = flat_send_IPI_all,
.send_IPI_allbutself = flat_send_IPI_allbutself,
.send_IPI_mask = flat_send_IPI_mask,
+ .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
.send_IPI_self = apic_send_IPI_self,
.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
.phys_pkg_id = phys_pkg_id,
.get_apic_id = get_apic_id,
.set_apic_id = set_apic_id,
@@ -188,35 +219,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-static cpumask_t physflat_target_cpus(void)
+static const struct cpumask *physflat_target_cpus(void)
{
- return cpu_online_map;
+ return cpu_online_mask;
}
-static cpumask_t physflat_vector_allocation_domain(int cpu)
+static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
- return cpumask_of_cpu(cpu);
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
}
-static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
+static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
{
send_IPI_mask_sequence(cpumask, vector);
}
-static void physflat_send_IPI_allbutself(int vector)
+static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+ int vector)
{
- cpumask_t allbutme = cpu_online_map;
+ send_IPI_mask_allbutself(cpumask, vector);
+}
- cpu_clear(smp_processor_id(), allbutme);
- physflat_send_IPI_mask(allbutme, vector);
+static void physflat_send_IPI_allbutself(int vector)
+{
+ send_IPI_mask_allbutself(cpu_online_mask, vector);
}
static void physflat_send_IPI_all(int vector)
{
- physflat_send_IPI_mask(cpu_online_map, vector);
+ physflat_send_IPI_mask(cpu_online_mask, vector);
}
-static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
int cpu;
@@ -224,13 +259,29 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = first_cpu(cpumask);
+ cpu = cpumask_first(cpumask);
if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
else
return BAD_APICID;
}
+static unsigned int
+physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = cpumask_any_and(cpumask, andmask);
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ return BAD_APICID;
+}
+
struct genapic apic_physflat = {
.name = "physical flat",
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
@@ -243,8 +294,10 @@ struct genapic apic_physflat = {
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_allbutself = physflat_send_IPI_allbutself,
.send_IPI_mask = physflat_send_IPI_mask,
+ .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
.send_IPI_self = apic_send_IPI_self,
.cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
.phys_pkg_id = phys_pkg_id,
.get_apic_id = get_apic_id,
.set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index f6a2c8eb48a..4716a0c9f93 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -22,19 +22,18 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
{
- return cpumask_of_cpu(0);
+ return cpumask_of(0);
}
/*
* for now each logical cpu is in its own vector allocation domain.
*/
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
- cpumask_t domain = CPU_MASK_NONE;
- cpu_set(cpu, domain);
- return domain;
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
}
static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -56,32 +55,53 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
* at once. We have 16 cpu's in a cluster. This will minimize IPI register
* writes.
*/
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
unsigned long flags;
unsigned long query_cpu;
local_irq_save(flags);
- for_each_cpu_mask(query_cpu, mask) {
- __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, APIC_DEST_LOGICAL);
- }
+ for_each_cpu(query_cpu, mask)
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
{
- cpumask_t mask = cpu_online_map;
+ unsigned long flags;
+ unsigned long query_cpu;
+ unsigned long this_cpu = smp_processor_id();
- cpu_clear(smp_processor_id(), mask);
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask)
+ if (query_cpu != this_cpu)
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, APIC_DEST_LOGICAL);
+ local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ unsigned long flags;
+ unsigned long query_cpu;
+ unsigned long this_cpu = smp_processor_id();
- if (!cpus_empty(mask))
- x2apic_send_IPI_mask(mask, vector);
+ local_irq_save(flags);
+ for_each_online_cpu(query_cpu)
+ if (query_cpu != this_cpu)
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, APIC_DEST_LOGICAL);
+ local_irq_restore(flags);
}
static void x2apic_send_IPI_all(int vector)
{
- x2apic_send_IPI_mask(cpu_online_map, vector);
+ x2apic_send_IPI_mask(cpu_online_mask, vector);
}
static int x2apic_apic_id_registered(void)
@@ -89,7 +109,7 @@ static int x2apic_apic_id_registered(void)
return 1;
}
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
int cpu;
@@ -97,13 +117,28 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = first_cpu(cpumask);
- if ((unsigned)cpu < NR_CPUS)
+ cpu = cpumask_first(cpumask);
+ if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_logical_apicid, cpu);
else
return BAD_APICID;
}
+static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = cpumask_any_and(cpumask, andmask);
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ return BAD_APICID;
+}
+
static unsigned int get_apic_id(unsigned long x)
{
unsigned int id;
@@ -150,8 +185,10 @@ struct genapic apic_x2apic_cluster = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_self = x2apic_send_IPI_self,
.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
.phys_pkg_id = phys_pkg_id,
.get_apic_id = get_apic_id,
.set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index d042211768b..b255507884f 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
{
- return cpumask_of_cpu(0);
+ return cpumask_of(0);
}
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
- cpumask_t domain = CPU_MASK_NONE;
- cpu_set(cpu, domain);
- return domain;
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
}
static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -54,32 +53,54 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
x2apic_icr_write(cfg, apicid);
}
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
unsigned long flags;
unsigned long query_cpu;
local_irq_save(flags);
- for_each_cpu_mask(query_cpu, mask) {
+ for_each_cpu(query_cpu, mask) {
__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
{
- cpumask_t mask = cpu_online_map;
+ unsigned long flags;
+ unsigned long query_cpu;
+ unsigned long this_cpu = smp_processor_id();
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu != this_cpu)
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
- cpu_clear(smp_processor_id(), mask);
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ unsigned long flags;
+ unsigned long query_cpu;
+ unsigned long this_cpu = smp_processor_id();
- if (!cpus_empty(mask))
- x2apic_send_IPI_mask(mask, vector);
+ local_irq_save(flags);
+ for_each_online_cpu(query_cpu)
+ if (query_cpu != this_cpu)
+ __x2apic_send_IPI_dest(
+ per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
}
static void x2apic_send_IPI_all(int vector)
{
- x2apic_send_IPI_mask(cpu_online_map, vector);
+ x2apic_send_IPI_mask(cpu_online_mask, vector);
}
static int x2apic_apic_id_registered(void)
@@ -87,7 +108,7 @@ static int x2apic_apic_id_registered(void)
return 1;
}
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
int cpu;
@@ -95,13 +116,28 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = first_cpu(cpumask);
- if ((unsigned)cpu < NR_CPUS)
+ cpu = cpumask_first(cpumask);
+ if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
else
return BAD_APICID;
}
+static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = cpumask_any_and(cpumask, andmask);
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ return BAD_APICID;
+}
+
static unsigned int get_apic_id(unsigned long x)
{
unsigned int id;
@@ -145,8 +181,10 @@ struct genapic apic_x2apic_phys = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_self = x2apic_send_IPI_self,
.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
.phys_pkg_id = phys_pkg_id,
.get_apic_id = get_apic_id,
.set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 2c7dbdb9827..3984682cd84 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -75,16 +75,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
-static cpumask_t uv_target_cpus(void)
+static const struct cpumask *uv_target_cpus(void)
{
- return cpumask_of_cpu(0);
+ return cpumask_of(0);
}
-static cpumask_t uv_vector_allocation_domain(int cpu)
+static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
- cpumask_t domain = CPU_MASK_NONE;
- cpu_set(cpu, domain);
- return domain;
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
}
int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@ -123,28 +122,37 @@ static void uv_send_IPI_one(int cpu, int vector)
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
-static void uv_send_IPI_mask(cpumask_t mask, int vector)
+static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
{
unsigned int cpu;
- for_each_possible_cpu(cpu)
- if (cpu_isset(cpu, mask))
+ for_each_cpu(cpu, mask)
+ uv_send_IPI_one(cpu, vector);
+}
+
+static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+ unsigned int this_cpu = smp_processor_id();
+
+ for_each_cpu(cpu, mask)
+ if (cpu != this_cpu)
uv_send_IPI_one(cpu, vector);
}
static void uv_send_IPI_allbutself(int vector)
{
- cpumask_t mask = cpu_online_map;
-
- cpu_clear(smp_processor_id(), mask);
+ unsigned int cpu;
+ unsigned int this_cpu = smp_processor_id();
- if (!cpus_empty(mask))
- uv_send_IPI_mask(mask, vector);
+ for_each_online_cpu(cpu)
+ if (cpu != this_cpu)
+ uv_send_IPI_one(cpu, vector);
}
static void uv_send_IPI_all(int vector)
{
- uv_send_IPI_mask(cpu_online_map, vector);
+ uv_send_IPI_mask(cpu_online_mask, vector);
}
static int uv_apic_id_registered(void)
@@ -156,7 +164,7 @@ static void uv_init_apic_ldr(void)
{
}
-static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
int cpu;
@@ -164,13 +172,28 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = first_cpu(cpumask);
+ cpu = cpumask_first(cpumask);
if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
else
return BAD_APICID;
}
+static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = cpumask_any_and(cpumask, andmask);
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ return BAD_APICID;
+}
+
static unsigned int get_apic_id(unsigned long x)
{
unsigned int id;
@@ -218,8 +241,10 @@ struct genapic apic_x2apic_uv_x = {
.send_IPI_all = uv_send_IPI_all,
.send_IPI_allbutself = uv_send_IPI_allbutself,
.send_IPI_mask = uv_send_IPI_mask,
+ .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
.send_IPI_self = uv_send_IPI_self,
.cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
.phys_pkg_id = phys_pkg_id,
.get_apic_id = get_apic_id,
.set_apic_id = set_apic_id,
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 77017e834cf..e76d7e27297 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -246,7 +246,7 @@ static void hpet_legacy_clockevent_register(void)
* Start hpet with the boot cpu mask and make it
* global after the IO_APIC has been initialized.
*/
- hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+ hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
clockevents_register_device(&hpet_clockevent);
global_clock_event = &hpet_clockevent;
printk(KERN_DEBUG "hpet clockevent registered\n");
@@ -301,7 +301,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
hpet_setup_msi_irq(hdev->irq);
disable_irq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+ irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
enable_irq(hdev->irq);
}
break;
@@ -322,7 +322,7 @@ static int hpet_next_event(unsigned long delta,
* what we wrote hit the chip before we compare it to the
* counter.
*/
- WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
+ WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt);
return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
}
@@ -445,11 +445,11 @@ static int hpet_setup_irq(struct hpet_dev *dev)
{
if (request_irq(dev->irq, hpet_interrupt_handler,
- IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev))
+ IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev))
return -1;
disable_irq(dev->irq);
- irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+ irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
enable_irq(dev->irq);
printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
@@ -500,7 +500,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
/* 5 usec minimum reprogramming delta. */
evt->min_delta_ns = 5000;
- evt->cpumask = cpumask_of_cpu(hdev->cpu);
+ evt->cpumask = cpumask_of(hdev->cpu);
clockevents_register_device(evt);
}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 1f20608d4ca..b0f61f0dcd0 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -58,7 +58,7 @@ void __cpuinit mxcsr_feature_mask_init(void)
stts();
}
-void __init init_thread_xstate(void)
+void __cpuinit init_thread_xstate(void)
{
if (!HAVE_HWFP) {
xstate_size = sizeof(struct i387_soft_struct);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c1b5e3ece1f..10f92fb532f 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -114,7 +114,7 @@ void __init setup_pit_timer(void)
* Start pit with the boot cpu mask and make it global after the
* IO_APIC has been initialized.
*/
- pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+ pit_clockevent.cpumask = cpumask_of(smp_processor_id());
pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
pit_clockevent.shift);
pit_clockevent.max_delta_ns =
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 7a3f2028e2e..58938cc4b7d 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -108,94 +108,273 @@ static int __init parse_noapic(char *str)
early_param("noapic", parse_noapic);
struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+ int apic, pin;
+ struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+ struct irq_pin_list *pin;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+ printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+ return pin;
+}
+
struct irq_cfg {
- unsigned int irq;
struct irq_pin_list *irq_2_pin;
- cpumask_t domain;
- cpumask_t old_domain;
+ cpumask_var_t domain;
+ cpumask_var_t old_domain;
unsigned move_cleanup_count;
u8 vector;
u8 move_in_progress : 1;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ u8 move_desc_pending : 1;
+#endif
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
static struct irq_cfg irq_cfgx[NR_IRQS] = {
- [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
- [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
- [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
- [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
- [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
- [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
- [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
- [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
- [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
- [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
- [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
- [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
- [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
- [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
- [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
- [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+ [0] = { .vector = IRQ0_VECTOR, },
+ [1] = { .vector = IRQ1_VECTOR, },
+ [2] = { .vector = IRQ2_VECTOR, },
+ [3] = { .vector = IRQ3_VECTOR, },
+ [4] = { .vector = IRQ4_VECTOR, },
+ [5] = { .vector = IRQ5_VECTOR, },
+ [6] = { .vector = IRQ6_VECTOR, },
+ [7] = { .vector = IRQ7_VECTOR, },
+ [8] = { .vector = IRQ8_VECTOR, },
+ [9] = { .vector = IRQ9_VECTOR, },
+ [10] = { .vector = IRQ10_VECTOR, },
+ [11] = { .vector = IRQ11_VECTOR, },
+ [12] = { .vector = IRQ12_VECTOR, },
+ [13] = { .vector = IRQ13_VECTOR, },
+ [14] = { .vector = IRQ14_VECTOR, },
+ [15] = { .vector = IRQ15_VECTOR, },
};
-#define for_each_irq_cfg(irq, cfg) \
- for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+void __init arch_early_irq_init(void)
+{
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ int count;
+ int i;
+ cfg = irq_cfgx;
+ count = ARRAY_SIZE(irq_cfgx);
+
+ for (i = 0; i < count; i++) {
+ desc = irq_to_desc(i);
+ desc->chip_data = &cfg[i];
+ alloc_bootmem_cpumask_var(&cfg[i].domain);
+ alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ if (i < NR_IRQS_LEGACY)
+ cpumask_setall(cfg[i].domain);
+ }
+}
+
+#ifdef CONFIG_SPARSE_IRQ
static struct irq_cfg *irq_cfg(unsigned int irq)
{
- return irq < nr_irqs ? irq_cfgx + irq : NULL;
+ struct irq_cfg *cfg = NULL;
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+ if (desc)
+ cfg = desc->chip_data;
+
+ return cfg;
}
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
{
- return irq_cfg(irq);
+ struct irq_cfg *cfg;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+ if (cfg) {
+ /* FIXME: needs alloc_cpumask_var_node() */
+ if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) {
+ kfree(cfg);
+ cfg = NULL;
+ } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) {
+ free_cpumask_var(cfg->domain);
+ kfree(cfg);
+ cfg = NULL;
+ } else {
+ cpumask_clear(cfg->domain);
+ cpumask_clear(cfg->old_domain);
+ }
+ }
+ printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node);
+
+ return cfg;
}
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+ struct irq_cfg *cfg;
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+ cfg = desc->chip_data;
+ if (!cfg) {
+ desc->chip_data = get_one_free_irq_cfg(cpu);
+ if (!desc->chip_data) {
+ printk(KERN_ERR "can not alloc irq_cfg\n");
+ BUG_ON(1);
+ }
+ }
+}
-struct irq_pin_list {
- int apic, pin;
- struct irq_pin_list *next;
-};
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+static void
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+{
+ struct irq_pin_list *old_entry, *head, *tail, *entry;
-static void __init irq_2_pin_init(void)
+ cfg->irq_2_pin = NULL;
+ old_entry = old_cfg->irq_2_pin;
+ if (!old_entry)
+ return;
+
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry)
+ return;
+
+ entry->apic = old_entry->apic;
+ entry->pin = old_entry->pin;
+ head = entry;
+ tail = entry;
+ old_entry = old_entry->next;
+ while (old_entry) {
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry) {
+ entry = head;
+ while (entry) {
+ head = entry->next;
+ kfree(entry);
+ entry = head;
+ }
+ /* still use the old one */
+ return;
+ }
+ entry->apic = old_entry->apic;
+ entry->pin = old_entry->pin;
+ tail->next = entry;
+ tail = entry;
+ old_entry = old_entry->next;
+ }
+
+ tail->next = NULL;
+ cfg->irq_2_pin = head;
+}
+
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
{
- struct irq_pin_list *pin = irq_2_pin_head;
- int i;
+ struct irq_pin_list *entry, *next;
+
+ if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+ return;
- for (i = 1; i < PIN_MAP_SIZE; i++)
- pin[i-1].next = &pin[i];
+ entry = old_cfg->irq_2_pin;
- irq_2_pin_ptr = &pin[0];
+ while (entry) {
+ next = entry->next;
+ kfree(entry);
+ entry = next;
+ }
+ old_cfg->irq_2_pin = NULL;
}
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+ struct irq_desc *desc, int cpu)
{
- struct irq_pin_list *pin = irq_2_pin_ptr;
+ struct irq_cfg *cfg;
+ struct irq_cfg *old_cfg;
- if (!pin)
- panic("can not get more irq_2_pin\n");
+ cfg = get_one_free_irq_cfg(cpu);
- irq_2_pin_ptr = pin->next;
- pin->next = NULL;
- return pin;
+ if (!cfg)
+ return;
+
+ desc->chip_data = cfg;
+
+ old_cfg = old_desc->chip_data;
+
+ memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+ init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+ kfree(old_cfg);
}
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+ struct irq_cfg *old_cfg, *cfg;
+
+ old_cfg = old_desc->chip_data;
+ cfg = desc->chip_data;
+
+ if (old_cfg == cfg)
+ return;
+
+ if (old_cfg) {
+ free_irq_2_pin(old_cfg, cfg);
+ free_irq_cfg(old_cfg);
+ old_desc->chip_data = NULL;
+ }
+}
+
+static void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg = desc->chip_data;
+
+ if (!cfg->move_in_progress) {
+ /* it means that domain is not changed */
+ if (!cpumask_intersects(&desc->affinity, mask))
+ cfg->move_desc_pending = 1;
+ }
+}
+#endif
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
+
+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
+static inline void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+}
+#endif
+
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -237,11 +416,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
{
struct irq_pin_list *entry;
unsigned long flags;
- struct irq_cfg *cfg = irq_cfg(irq);
spin_lock_irqsave(&ioapic_lock, flags);
entry = cfg->irq_2_pin;
@@ -323,13 +501,32 @@ static void ioapic_mask_entry(int apic, int pin)
}
#ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ cpumask_var_t cleanup_mask;
+
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+ cfg->move_cleanup_count = 0;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ cfg->move_cleanup_count++;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
{
int apic, pin;
- struct irq_cfg *cfg;
struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
- cfg = irq_cfg(irq);
entry = cfg->irq_2_pin;
for (;;) {
unsigned int reg;
@@ -359,36 +556,61 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
}
}
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
+
+/*
+ * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
+ * of that, or returns BAD_APICID and leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg;
+ unsigned int irq;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return BAD_APICID;
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+ irq = desc->irq;
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
+ return BAD_APICID;
+
+ cpumask_and(&desc->affinity, cfg->domain, mask);
+ set_extra_move_desc(desc, mask);
+ return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+}
+
+static void
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
unsigned long flags;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
+ unsigned int irq;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
+ irq = desc->irq;
+ cfg = desc->chip_data;
- cfg = irq_cfg(irq);
- if (assign_irq_vector(irq, mask))
- return;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ dest = set_desc_affinity(desc, mask);
+ if (dest != BAD_APICID) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, cfg);
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
- /*
- * Only the high 8 bits are valid.
- */
- dest = SET_APIC_LOGICAL_ID(dest);
+static void
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+ struct irq_desc *desc;
desc = irq_to_desc(irq);
- spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg->vector);
- desc->affinity = mask;
- spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ set_ioapic_affinity_irq_desc(desc, mask);
}
#endif /* CONFIG_SMP */
@@ -397,16 +619,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
{
- struct irq_cfg *cfg;
struct irq_pin_list *entry;
- /* first time to refer irq_cfg, so with new */
- cfg = irq_cfg_alloc(irq);
entry = cfg->irq_2_pin;
if (!entry) {
- entry = get_one_free_irq_2_pin();
+ entry = get_one_free_irq_2_pin(cpu);
+ if (!entry) {
+ printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+ apic, pin);
+ return;
+ }
cfg->irq_2_pin = entry;
entry->apic = apic;
entry->pin = pin;
@@ -421,7 +645,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
entry = entry->next;
}
- entry->next = get_one_free_irq_2_pin();
+ entry->next = get_one_free_irq_2_pin(cpu);
entry = entry->next;
entry->apic = apic;
entry->pin = pin;
@@ -430,11 +654,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
/*
* Reroute an IRQ to a different pin.
*/
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
int oldapic, int oldpin,
int newapic, int newpin)
{
- struct irq_cfg *cfg = irq_cfg(irq);
struct irq_pin_list *entry = cfg->irq_2_pin;
int replaced = 0;
@@ -451,18 +674,16 @@ static void __init replace_pin_at_irq(unsigned int irq,
/* why? call replace before add? */
if (!replaced)
- add_pin_to_irq(irq, newapic, newpin);
+ add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
}
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
int mask_and, int mask_or,
void (*final)(struct irq_pin_list *entry))
{
int pin;
- struct irq_cfg *cfg;
struct irq_pin_list *entry;
- cfg = irq_cfg(irq);
for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
unsigned int reg;
pin = entry->pin;
@@ -475,9 +696,9 @@ static inline void io_apic_modify_irq(unsigned int irq,
}
}
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
}
#ifdef CONFIG_X86_64
@@ -492,47 +713,64 @@ void io_apic_sync(struct irq_pin_list *entry)
readl(&io_apic->data);
}
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
}
#else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
}
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
IO_APIC_REDIR_MASKED, NULL);
}
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
{
- io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
}
#endif /* CONFIG_X86_32 */
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
{
+ struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
+ BUG_ON(!cfg);
+
spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(irq);
+ __mask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
{
+ struct irq_cfg *cfg = desc->chip_data;
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(irq);
+ __unmask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ mask_IO_APIC_irq_desc(desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ unmask_IO_APIC_irq_desc(desc);
+}
+
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
@@ -809,7 +1047,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
*/
static int EISA_ELCR(unsigned int irq)
{
- if (irq < 16) {
+ if (irq < NR_IRQS_LEGACY) {
unsigned int port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
@@ -1034,7 +1272,8 @@ void unlock_vector_lock(void)
spin_unlock(&vector_lock);
}
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -1049,39 +1288,39 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
*/
static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
unsigned int old_vector;
- int cpu;
- struct irq_cfg *cfg;
-
- cfg = irq_cfg(irq);
-
- /* Only try and allocate irqs on cpus that are present */
- cpus_and(mask, mask, cpu_online_map);
+ int cpu, err;
+ cpumask_var_t tmp_mask;
if ((cfg->move_in_progress) || cfg->move_cleanup_count)
return -EBUSY;
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return -ENOMEM;
+
old_vector = cfg->vector;
if (old_vector) {
- cpumask_t tmp;
- cpus_and(tmp, cfg->domain, mask);
- if (!cpus_empty(tmp))
+ cpumask_and(tmp_mask, mask, cpu_online_mask);
+ cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+ if (!cpumask_empty(tmp_mask)) {
+ free_cpumask_var(tmp_mask);
return 0;
+ }
}
- for_each_cpu_mask_nr(cpu, mask) {
- cpumask_t domain, new_mask;
+ /* Only try and allocate irqs on cpus that are present */
+ err = -ENOSPC;
+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
int new_cpu;
int vector, offset;
- domain = vector_allocation_domain(cpu);
- cpus_and(new_mask, domain, cpu_online_map);
+ vector_allocation_domain(cpu, tmp_mask);
vector = current_vector;
offset = current_offset;
next:
vector += 8;
if (vector >= first_system_vector) {
- /* If we run out of vectors on large boxen, must share them. */
+ /* If out of vectors on large boxen, must share them. */
offset = (offset + 1) % 8;
vector = FIRST_DEVICE_VECTOR + offset;
}
@@ -1094,7 +1333,7 @@ next:
if (vector == SYSCALL_VECTOR)
goto next;
#endif
- for_each_cpu_mask_nr(new_cpu, new_mask)
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
if (per_cpu(vector_irq, new_cpu)[vector] != -1)
goto next;
/* Found one! */
@@ -1102,44 +1341,56 @@ next:
current_offset = offset;
if (old_vector) {
cfg->move_in_progress = 1;
- cfg->old_domain = cfg->domain;
+ cpumask_copy(cfg->old_domain, cfg->domain);
}
- for_each_cpu_mask_nr(new_cpu, new_mask)
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
cfg->vector = vector;
- cfg->domain = domain;
- return 0;
+ cpumask_copy(cfg->domain, tmp_mask);
+ err = 0;
+ break;
}
- return -ENOSPC;
+ free_cpumask_var(tmp_mask);
+ return err;
}
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
int err;
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, mask);
+ err = __assign_irq_vector(irq, cfg, mask);
spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
{
- struct irq_cfg *cfg;
- cpumask_t mask;
int cpu, vector;
- cfg = irq_cfg(irq);
BUG_ON(!cfg->vector);
vector = cfg->vector;
- cpus_and(mask, cfg->domain, cpu_online_map);
- for_each_cpu_mask_nr(cpu, mask)
+ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
- cpus_clear(cfg->domain);
+ cpumask_clear(cfg->domain);
+
+ if (likely(!cfg->move_in_progress))
+ return;
+ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
+ vector++) {
+ if (per_cpu(vector_irq, cpu)[vector] != irq)
+ continue;
+ per_cpu(vector_irq, cpu)[vector] = -1;
+ break;
+ }
+ }
+ cfg->move_in_progress = 0;
}
void __setup_vector_irq(int cpu)
@@ -1148,10 +1399,14 @@ void __setup_vector_irq(int cpu)
/* This function must be called with vector_lock held */
int irq, vector;
struct irq_cfg *cfg;
+ struct irq_desc *desc;
/* Mark the inuse vectors */
- for_each_irq_cfg(irq, cfg) {
- if (!cpu_isset(cpu, cfg->domain))
+ for_each_irq_desc(irq, desc) {
+ if (!desc)
+ continue;
+ cfg = desc->chip_data;
+ if (!cpumask_test_cpu(cpu, cfg->domain))
continue;
vector = cfg->vector;
per_cpu(vector_irq, cpu)[vector] = irq;
@@ -1163,7 +1418,7 @@ void __setup_vector_irq(int cpu)
continue;
cfg = irq_cfg(irq);
- if (!cpu_isset(cpu, cfg->domain))
+ if (!cpumask_test_cpu(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
}
@@ -1201,11 +1456,8 @@ static inline int IO_APIC_irq_trigger(int irq)
}
#endif
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL)
@@ -1297,23 +1549,22 @@ static int setup_ioapic_entry(int apic, int irq,
return 0;
}
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
int trigger, int polarity)
{
struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
- cpumask_t mask;
+ unsigned int dest;
if (!IO_APIC_IRQ(irq))
return;
- cfg = irq_cfg(irq);
+ cfg = desc->chip_data;
- mask = TARGET_CPUS;
- if (assign_irq_vector(irq, mask))
+ if (assign_irq_vector(irq, cfg, TARGET_CPUS))
return;
- cpus_and(mask, cfg->domain, mask);
+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1323,16 +1574,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
- cpu_mask_to_apicid(mask), trigger, polarity,
- cfg->vector)) {
+ dest, trigger, polarity, cfg->vector)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
mp_ioapics[apic].mp_apicid, pin);
- __clear_irq_vector(irq);
+ __clear_irq_vector(irq, cfg);
return;
}
- ioapic_register_intr(irq, trigger);
- if (irq < 16)
+ ioapic_register_intr(irq, desc, trigger);
+ if (irq < NR_IRQS_LEGACY)
disable_8259A_irq(irq);
ioapic_write_entry(apic, pin, entry);
@@ -1342,6 +1592,9 @@ static void __init setup_IO_APIC_irqs(void)
{
int apic, pin, idx, irq;
int notcon = 0;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int cpu = boot_cpu_id;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1373,9 +1626,15 @@ static void __init setup_IO_APIC_irqs(void)
if (multi_timer_check(apic, irq))
continue;
#endif
- add_pin_to_irq(irq, apic, pin);
+ desc = irq_to_desc_alloc_cpu(irq, cpu);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ continue;
+ }
+ cfg = desc->chip_data;
+ add_pin_to_irq_cpu(cfg, cpu, apic, pin);
- setup_IO_APIC_irq(apic, pin, irq,
+ setup_IO_APIC_irq(apic, pin, irq, desc,
irq_trigger(idx), irq_polarity(idx));
}
}
@@ -1434,6 +1693,7 @@ __apicdebuginit(void) print_IO_APIC(void)
union IO_APIC_reg_03 reg_03;
unsigned long flags;
struct irq_cfg *cfg;
+ struct irq_desc *desc;
unsigned int irq;
if (apic_verbosity == APIC_QUIET)
@@ -1523,8 +1783,13 @@ __apicdebuginit(void) print_IO_APIC(void)
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for_each_irq_cfg(irq, cfg) {
- struct irq_pin_list *entry = cfg->irq_2_pin;
+ for_each_irq_desc(irq, desc) {
+ struct irq_pin_list *entry;
+
+ if (!desc)
+ continue;
+ cfg = desc->chip_data;
+ entry = cfg->irq_2_pin;
if (!entry)
continue;
printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2008,14 +2273,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
{
int was_pending = 0;
unsigned long flags;
+ struct irq_cfg *cfg;
spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < 16) {
+ if (irq < NR_IRQS_LEGACY) {
disable_8259A_irq(irq);
if (i8259A_irq_pending(irq))
was_pending = 1;
}
- __unmask_IO_APIC_irq(irq);
+ cfg = irq_cfg(irq);
+ __unmask_IO_APIC_irq(cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
@@ -2029,7 +2296,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
+ send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -2078,35 +2345,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
* as simple as edge triggered migration and we can do the irq migration
* with a simple atomic update to IO-APIC RTE.
*/
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void
+migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
- struct irq_desc *desc;
- cpumask_t tmp, cleanup_mask;
struct irte irte;
int modify_ioapic_rte;
unsigned int dest;
unsigned long flags;
+ unsigned int irq;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ if (!cpumask_intersects(mask, cpu_online_mask))
return;
+ irq = desc->irq;
if (get_irte(irq, &irte))
return;
- if (assign_irq_vector(irq, mask))
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
return;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ set_extra_move_desc(desc, mask);
+
+ dest = cpu_mask_to_apicid_and(cfg->domain, mask);
- desc = irq_to_desc(irq);
modify_ioapic_rte = desc->status & IRQ_LEVEL;
if (modify_ioapic_rte) {
spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg->vector);
+ __target_IO_APIC_irq(irq, dest, cfg);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -2118,24 +2385,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
*/
modify_irte(irq, &irte);
- if (cfg->move_in_progress) {
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
- desc->affinity = mask;
+ cpumask_copy(&desc->affinity, mask);
}
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
{
int ret = -1;
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = desc->chip_data;
- mask_IO_APIC_irq(irq);
+ mask_IO_APIC_irq_desc(desc);
- if (io_apic_level_ack_pending(irq)) {
+ if (io_apic_level_ack_pending(cfg)) {
/*
* Interrupt in progress. Migrating irq now will change the
* vector information in the IO-APIC RTE and that will confuse
@@ -2147,14 +2410,15 @@ static int migrate_irq_remapped_level(int irq)
}
/* everthing is clear. we have right of way */
- migrate_ioapic_irq(irq, desc->pending_mask);
+ migrate_ioapic_irq_desc(desc, &desc->pending_mask);
ret = 0;
desc->status &= ~IRQ_MOVE_PENDING;
- cpus_clear(desc->pending_mask);
+ cpumask_clear(&desc->pending_mask);
unmask:
- unmask_IO_APIC_irq(irq);
+ unmask_IO_APIC_irq_desc(desc);
+
return ret;
}
@@ -2164,6 +2428,9 @@ static void ir_irq_migration(struct work_struct *work)
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
+ if (!desc)
+ continue;
+
if (desc->status & IRQ_MOVE_PENDING) {
unsigned long flags;
@@ -2175,7 +2442,7 @@ static void ir_irq_migration(struct work_struct *work)
continue;
}
- desc->chip->set_affinity(irq, desc->pending_mask);
+ desc->chip->set_affinity(irq, &desc->pending_mask);
spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -2184,18 +2451,24 @@ static void ir_irq_migration(struct work_struct *work)
/*
* Migrates the IRQ destination in the process context.
*/
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
if (desc->status & IRQ_LEVEL) {
desc->status |= IRQ_MOVE_PENDING;
- desc->pending_mask = mask;
- migrate_irq_remapped_level(irq);
+ cpumask_copy(&desc->pending_mask, mask);
+ migrate_irq_remapped_level_desc(desc);
return;
}
- migrate_ioapic_irq(irq, mask);
+ migrate_ioapic_irq_desc(desc, mask);
+}
+static void set_ir_ioapic_affinity_irq(unsigned int irq,
+ const struct cpumask *mask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
}
#endif
@@ -2215,6 +2488,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
+ if (irq == -1)
+ continue;
+
desc = irq_to_desc(irq);
if (!desc)
continue;
@@ -2224,7 +2500,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
if (!cfg->move_cleanup_count)
goto unlock;
- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
__get_cpu_var(vector_irq)[vector] = -1;
@@ -2236,28 +2512,44 @@ unlock:
irq_exit();
}
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
{
- struct irq_cfg *cfg = irq_cfg(irq);
+ struct irq_desc *desc = *descp;
+ struct irq_cfg *cfg = desc->chip_data;
unsigned vector, me;
- if (likely(!cfg->move_in_progress))
+ if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ if (likely(!cfg->move_desc_pending))
+ return;
+
+ /* domain is not change, but affinity is changed */
+ me = smp_processor_id();
+ if (cpu_isset(me, desc->affinity)) {
+ *descp = desc = move_irq_desc(desc, me);
+ /* get the new one */
+ cfg = desc->chip_data;
+ cfg->move_desc_pending = 0;
+ }
+#endif
return;
+ }
vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
- cpumask_t cleanup_mask;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+ *descp = desc = move_irq_desc(desc, me);
+ /* get the new one */
+ cfg = desc->chip_data;
+#endif
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ send_cleanup_vector(cfg);
}
#else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
+
#ifdef CONFIG_INTR_REMAP
static void ack_x2apic_level(unsigned int irq)
{
@@ -2268,11 +2560,14 @@ static void ack_x2apic_edge(unsigned int irq)
{
ack_x2APIC_irq();
}
+
#endif
static void ack_apic_edge(unsigned int irq)
{
- irq_complete_move(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ irq_complete_move(&desc);
move_native_irq(irq);
ack_APIC_irq();
}
@@ -2281,18 +2576,21 @@ atomic_t irq_mis_count;
static void ack_apic_level(unsigned int irq)
{
+ struct irq_desc *desc = irq_to_desc(irq);
+
#ifdef CONFIG_X86_32
unsigned long v;
int i;
#endif
+ struct irq_cfg *cfg;
int do_unmask_irq = 0;
- irq_complete_move(irq);
+ irq_complete_move(&desc);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
- if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+ if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
- mask_IO_APIC_irq(irq);
+ mask_IO_APIC_irq_desc(desc);
}
#endif
@@ -2316,7 +2614,8 @@ static void ack_apic_level(unsigned int irq)
* operation to prevent an edge-triggered interrupt escaping meanwhile.
* The idea is from Manfred Spraul. --macro
*/
- i = irq_cfg(irq)->vector;
+ cfg = desc->chip_data;
+ i = cfg->vector;
v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
#endif
@@ -2355,17 +2654,18 @@ static void ack_apic_level(unsigned int irq)
* accurate and is causing problems then it is a hardware bug
* and you can go talk to the chipset vendor about it.
*/
- if (!io_apic_level_ack_pending(irq))
+ cfg = desc->chip_data;
+ if (!io_apic_level_ack_pending(cfg))
move_masked_irq(irq);
- unmask_IO_APIC_irq(irq);
+ unmask_IO_APIC_irq_desc(desc);
}
#ifdef CONFIG_X86_32
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
spin_lock(&ioapic_lock);
- __mask_and_edge_IO_APIC_irq(irq);
- __unmask_and_level_IO_APIC_irq(irq);
+ __mask_and_edge_IO_APIC_irq(cfg);
+ __unmask_and_level_IO_APIC_irq(cfg);
spin_unlock(&ioapic_lock);
}
#endif
@@ -2416,20 +2716,22 @@ static inline void init_IO_APIC_traps(void)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
- for_each_irq_cfg(irq, cfg) {
- if (IO_APIC_IRQ(irq) && !cfg->vector) {
+ for_each_irq_desc(irq, desc) {
+ if (!desc)
+ continue;
+
+ cfg = desc->chip_data;
+ if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (irq < 16)
+ if (irq < NR_IRQS_LEGACY)
make_8259A_irq(irq);
- else {
- desc = irq_to_desc(irq);
+ else
/* Strange. Oh, well.. */
desc->chip = &no_irq_chip;
- }
}
}
}
@@ -2454,7 +2756,7 @@ static void unmask_lapic_irq(unsigned int irq)
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
{
ack_APIC_irq();
}
@@ -2466,11 +2768,8 @@ static struct irq_chip lapic_chip __read_mostly = {
.ack = ack_lapic_irq,
};
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
desc->status &= ~IRQ_LEVEL;
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
@@ -2574,7 +2873,9 @@ int timer_through_8259 __initdata;
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_cfg(0);
+ struct irq_desc *desc = irq_to_desc(0);
+ struct irq_cfg *cfg = desc->chip_data;
+ int cpu = boot_cpu_id;
int apic1, pin1, apic2, pin2;
unsigned long flags;
unsigned int ver;
@@ -2589,7 +2890,7 @@ static inline void __init check_timer(void)
* get/set the timer IRQ vector:
*/
disable_8259A_irq(0);
- assign_irq_vector(0, TARGET_CPUS);
+ assign_irq_vector(0, cfg, TARGET_CPUS);
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2640,10 +2941,10 @@ static inline void __init check_timer(void)
* Ok, does IRQ0 through the IOAPIC work?
*/
if (no_pin1) {
- add_pin_to_irq(0, apic1, pin1);
+ add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
}
- unmask_IO_APIC_irq(0);
+ unmask_IO_APIC_irq_desc(desc);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
@@ -2669,9 +2970,9 @@ static inline void __init check_timer(void)
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- unmask_IO_APIC_irq(0);
+ unmask_IO_APIC_irq_desc(desc);
enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2703,7 +3004,7 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
- lapic_register_intr(0);
+ lapic_register_intr(0, desc);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -2888,22 +3189,26 @@ unsigned int create_irq_nr(unsigned int irq_want)
unsigned int irq;
unsigned int new;
unsigned long flags;
- struct irq_cfg *cfg_new;
-
- irq_want = nr_irqs - 1;
+ struct irq_cfg *cfg_new = NULL;
+ int cpu = boot_cpu_id;
+ struct irq_desc *desc_new = NULL;
irq = 0;
spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new > 0; new--) {
+ for (new = irq_want; new < NR_IRQS; new++) {
if (platform_legacy_irq(new))
continue;
- cfg_new = irq_cfg(new);
- if (cfg_new && cfg_new->vector != 0)
+
+ desc_new = irq_to_desc_alloc_cpu(new, cpu);
+ if (!desc_new) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", new);
+ continue;
+ }
+ cfg_new = desc_new->chip_data;
+
+ if (cfg_new->vector != 0)
continue;
- /* check if need to create one */
- if (!cfg_new)
- cfg_new = irq_cfg_alloc(new);
- if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+ if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
irq = new;
break;
}
@@ -2911,15 +3216,21 @@ unsigned int create_irq_nr(unsigned int irq_want)
if (irq > 0) {
dynamic_irq_init(irq);
+ /* restore it, in case dynamic_irq_init clear it */
+ if (desc_new)
+ desc_new->chip_data = cfg_new;
}
return irq;
}
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
int create_irq(void)
{
+ unsigned int irq_want;
int irq;
- irq = create_irq_nr(nr_irqs - 1);
+ irq_want = nr_irqs_gsi;
+ irq = create_irq_nr(irq_want);
if (irq == 0)
irq = -1;
@@ -2930,14 +3241,22 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
unsigned long flags;
+ struct irq_cfg *cfg;
+ struct irq_desc *desc;
+ /* store it, in case dynamic_irq_cleanup clear it */
+ desc = irq_to_desc(irq);
+ cfg = desc->chip_data;
dynamic_irq_cleanup(irq);
+ /* connect back irq_cfg */
+ if (desc)
+ desc->chip_data = cfg;
#ifdef CONFIG_INTR_REMAP
free_irte(irq);
#endif
spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq);
+ __clear_irq_vector(irq, cfg);
spin_unlock_irqrestore(&vector_lock, flags);
}
@@ -2950,16 +3269,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
struct irq_cfg *cfg;
int err;
unsigned dest;
- cpumask_t tmp;
- tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
if (err)
return err;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, tmp);
- dest = cpu_mask_to_apicid(tmp);
+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
@@ -3013,64 +3329,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
}
#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
- read_msi_msg(irq, &msg);
+ read_msi_msg_desc(desc, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- write_msi_msg(irq, &msg);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
+ write_msi_msg_desc(desc, &msg);
}
-
#ifdef CONFIG_INTR_REMAP
/*
* Migrate the MSI irq to another cpumask. This migration is
* done in the process context using interrupt-remapping hardware.
*/
-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void
+ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
unsigned int dest;
- cpumask_t tmp, cleanup_mask;
struct irte irte;
- struct irq_desc *desc;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
if (get_irte(irq, &irte))
return;
- if (assign_irq_vector(irq, mask))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
-
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -3084,16 +3384,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
* at the new destination. So, time to cleanup the previous
* vector allocation.
*/
- if (cfg->move_in_progress) {
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
-
- desc = irq_to_desc(irq);
- desc->affinity = mask;
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
}
+
#endif
#endif /* CONFIG_SMP */
@@ -3152,7 +3446,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
}
#endif
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
int ret;
struct msi_msg msg;
@@ -3161,7 +3455,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
if (ret < 0)
return ret;
- set_irq_msi(irq, desc);
+ set_irq_msi(irq, msidesc);
write_msi_msg(irq, &msg);
#ifdef CONFIG_INTR_REMAP
@@ -3181,26 +3475,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
return 0;
}
-static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
-{
- unsigned int irq;
-
- irq = dev->bus->number;
- irq <<= 8;
- irq |= dev->devfn;
- irq <<= 12;
-
- return irq;
-}
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
{
unsigned int irq;
int ret;
unsigned int irq_want;
- irq_want = build_irq_for_pci_dev(dev) + 0x100;
-
+ irq_want = nr_irqs_gsi;
irq = create_irq_nr(irq_want);
if (irq == 0)
return -1;
@@ -3214,7 +3495,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
goto error;
no_ir:
#endif
- ret = setup_msi_irq(dev, desc, irq);
+ ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0) {
destroy_irq(irq);
return ret;
@@ -3232,7 +3513,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
unsigned int irq;
int ret, sub_handle;
- struct msi_desc *desc;
+ struct msi_desc *msidesc;
unsigned int irq_want;
#ifdef CONFIG_INTR_REMAP
@@ -3240,10 +3521,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int index = 0;
#endif
- irq_want = build_irq_for_pci_dev(dev) + 0x100;
+ irq_want = nr_irqs_gsi;
sub_handle = 0;
- list_for_each_entry(desc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want--);
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = create_irq_nr(irq_want);
+ irq_want++;
if (irq == 0)
return -1;
#ifdef CONFIG_INTR_REMAP
@@ -3275,7 +3557,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
}
no_ir:
#endif
- ret = setup_msi_irq(dev, desc, irq);
+ ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0)
goto error;
sub_handle++;
@@ -3294,24 +3576,18 @@ void arch_teardown_msi_irq(unsigned int irq)
#ifdef CONFIG_DMAR
#ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
dmar_msi_read(irq, &msg);
@@ -3321,9 +3597,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
}
+
#endif /* CONFIG_SMP */
struct irq_chip dmar_msi_type = {
@@ -3355,24 +3630,18 @@ int arch_setup_dmar_msi(unsigned int irq)
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
- struct irq_desc *desc;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- if (assign_irq_vector(irq, mask))
- return;
-
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
hpet_msi_read(irq, &msg);
@@ -3382,9 +3651,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
hpet_msi_write(irq, &msg);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
}
+
#endif /* CONFIG_SMP */
struct irq_chip hpet_msi_type = {
@@ -3437,28 +3705,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
write_ht_irq_msg(irq, &msg);
}
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
+ struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
unsigned int dest;
- cpumask_t tmp;
- struct irq_desc *desc;
-
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
- if (assign_irq_vector(irq, mask))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ cfg = desc->chip_data;
target_ht_irq(irq, dest, cfg->vector);
- desc = irq_to_desc(irq);
- desc->affinity = mask;
}
+
#endif
static struct irq_chip ht_irq_chip = {
@@ -3476,17 +3737,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
{
struct irq_cfg *cfg;
int err;
- cpumask_t tmp;
- tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, tmp);
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
if (!err) {
struct ht_irq_msg msg;
unsigned dest;
- cfg = irq_cfg(irq);
- cpus_and(tmp, cfg->domain, tmp);
- dest = cpu_mask_to_apicid(tmp);
+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
@@ -3522,7 +3780,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long mmr_offset)
{
- const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+ const struct cpumask *eligible_cpu = cpumask_of(cpu);
struct irq_cfg *cfg;
int mmr_pnode;
unsigned long mmr_value;
@@ -3530,7 +3788,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long flags;
int err;
- err = assign_irq_vector(irq, *eligible_cpu);
+ cfg = irq_cfg(irq);
+
+ err = assign_irq_vector(irq, cfg, eligible_cpu);
if (err != 0)
return err;
@@ -3539,8 +3799,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
irq_name);
spin_unlock_irqrestore(&vector_lock, flags);
- cfg = irq_cfg(irq);
-
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3551,7 +3809,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
entry->polarity = 0;
entry->trigger = 0;
entry->mask = 0;
- entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+ entry->dest = cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3592,29 +3850,16 @@ int __init io_apic_get_redir_entries (int ioapic)
return reg_01.bits.entries;
}
-int __init probe_nr_irqs(void)
+void __init probe_nr_irqs_gsi(void)
{
int idx;
int nr = 0;
-#ifndef CONFIG_XEN
- int nr_min = 32;
-#else
- int nr_min = NR_IRQS;
-#endif
for (idx = 0; idx < nr_ioapics; idx++)
nr += io_apic_get_redir_entries(idx) + 1;
- /* double it for hotplug and msi and nmi */
- nr <<= 1;
-
- /* something wrong ? */
- if (nr < nr_min)
- nr = nr_min;
- if (WARN_ON(nr > NR_IRQS))
- nr = NR_IRQS;
-
- return nr;
+ if (nr > nr_irqs_gsi)
+ nr_irqs_gsi = nr;
}
/* --------------------------------------------------------------------------
@@ -3713,19 +3958,31 @@ int __init io_apic_get_version(int ioapic)
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
{
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int cpu = boot_cpu_id;
+
if (!IO_APIC_IRQ(irq)) {
apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
ioapic);
return -EINVAL;
}
+ desc = irq_to_desc_alloc_cpu(irq, cpu);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ return 0;
+ }
+
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
- if (irq >= 16)
- add_pin_to_irq(irq, ioapic, pin);
+ if (irq >= NR_IRQS_LEGACY) {
+ cfg = desc->chip_data;
+ add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+ }
- setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+ setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
return 0;
}
@@ -3761,7 +4018,9 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
void __init setup_ioapic_dest(void)
{
int pin, ioapic, irq, irq_entry;
+ struct irq_desc *desc;
struct irq_cfg *cfg;
+ const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
@@ -3777,17 +4036,31 @@ void __init setup_ioapic_dest(void)
* when you have too many devices, because at that time only boot
* cpu is online.
*/
- cfg = irq_cfg(irq);
- if (!cfg->vector)
- setup_IO_APIC_irq(ioapic, pin, irq,
+ desc = irq_to_desc(irq);
+ cfg = desc->chip_data;
+ if (!cfg->vector) {
+ setup_IO_APIC_irq(ioapic, pin, irq, desc,
irq_trigger(irq_entry),
irq_polarity(irq_entry));
+ continue;
+
+ }
+
+ /*
+ * Honour affinities which have been set in early boot
+ */
+ if (desc->status &
+ (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+ mask = &desc->affinity;
+ else
+ mask = TARGET_CPUS;
+
#ifdef CONFIG_INTR_REMAP
- else if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
-#endif
+ if (intr_remapping_enabled)
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
else
- set_ioapic_affinity_irq(irq, TARGET_CPUS);
+#endif
+ set_ioapic_affinity_irq_desc(desc, mask);
}
}
@@ -3836,7 +4109,6 @@ void __init ioapic_init_mappings(void)
struct resource *ioapic_res;
int i;
- irq_2_pin_init();
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index f1c688e46f3..285bbf8831f 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
/*
* This is only used on smaller machines.
*/
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
{
- unsigned long mask = cpus_addr(cpumask)[0];
+ unsigned long mask = cpumask_bits(cpumask)[0];
unsigned long flags;
local_irq_save(flags);
- WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+ WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
__send_IPI_dest_field(mask, vector);
local_irq_restore(flags);
}
-void send_IPI_mask_sequence(cpumask_t mask, int vector)
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
{
unsigned long flags;
unsigned int query_cpu;
@@ -139,12 +139,24 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
*/
local_irq_save(flags);
- for_each_possible_cpu(query_cpu) {
- if (cpu_isset(query_cpu, mask)) {
+ for_each_cpu(query_cpu, mask)
+ __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
+ local_irq_restore(flags);
+}
+
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+ unsigned int this_cpu = smp_processor_id();
+
+ /* See Hack comment above */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask)
+ if (query_cpu != this_cpu)
__send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
vector);
- }
- }
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f64..3f1d9d18df6 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -118,6 +118,9 @@ int show_interrupts(struct seq_file *p, void *v)
}
desc = irq_to_desc(i);
+ if (!desc)
+ return 0;
+
spin_lock_irqsave(&desc->lock, flags);
#ifndef CONFIG_SMP
any_count = kstat_irqs(i);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a51382672de..9dc5588f336 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -233,25 +233,28 @@ unsigned int do_IRQ(struct pt_regs *regs)
#ifdef CONFIG_HOTPLUG_CPU
#include <mach_apic.h>
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
{
unsigned int irq;
static int warned;
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
- cpumask_t mask;
+ const struct cpumask *affinity;
+ if (!desc)
+ continue;
if (irq == 2)
continue;
- cpus_and(mask, desc->affinity, map);
- if (any_online_cpu(mask) == NR_CPUS) {
+ affinity = &desc->affinity;
+ if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
printk("Breaking affinity for irq %i\n", irq);
- mask = map;
+ affinity = cpu_all_mask;
}
if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, mask);
+ desc->chip->set_affinity(irq, affinity);
else if (desc->action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 60eb84eb77a..fca2991443f 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -83,40 +83,43 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
}
#ifdef CONFIG_HOTPLUG_CPU
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
{
unsigned int irq;
static int warned;
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
- cpumask_t mask;
int break_affinity = 0;
int set_affinity = 1;
+ const struct cpumask *affinity;
+ if (!desc)
+ continue;
if (irq == 2)
continue;
/* interrupt's are disabled at this point */
spin_lock(&desc->lock);
+ affinity = &desc->affinity;
if (!irq_has_action(irq) ||
- cpus_equal(desc->affinity, map)) {
+ cpumask_equal(affinity, cpu_online_mask)) {
spin_unlock(&desc->lock);
continue;
}
- cpus_and(mask, desc->affinity, map);
- if (cpus_empty(mask)) {
+ if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
break_affinity = 1;
- mask = map;
+ affinity = cpu_all_mask;
}
if (desc->chip->mask)
desc->chip->mask(irq);
if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, mask);
+ desc->chip->set_affinity(irq, affinity);
else if (!(warned++))
set_affinity = 0;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 845aa9803e8..6a92f47c52e 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
/*
* 16 old-style INTA-cycle interrupts:
*/
- for (i = 0; i < 16; i++) {
- /* first time call this irq_desc */
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
struct irq_desc *desc = irq_to_desc(i);
desc->status = IRQ_DISABLED;
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff023539128..40c1e62ec78 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
init_bsp_APIC();
init_8259A(0);
- for (i = 0; i < 16; i++) {
- /* first time call this irq_desc */
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
struct irq_desc *desc = irq_to_desc(i);
desc->status = IRQ_DISABLED;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 774ac499156..e169ae9b6a6 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -128,7 +128,7 @@ static int kvm_register_clock(char *txt)
}
#ifdef CONFIG_X86_LOCAL_APIC
-static void kvm_setup_secondary_clock(void)
+static void __cpuinit kvm_setup_secondary_clock(void)
{
/*
* Now that the first cpu already had this clocksource initialized,
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 7a385746509..37f420018a4 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -13,6 +13,7 @@
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/suspend.h>
+#include <linux/gfp.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -25,15 +26,6 @@
#include <asm/system.h>
#include <asm/cacheflush.h>
-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-static u32 kexec_pgd[1024] PAGE_ALIGNED;
-#ifdef CONFIG_X86_PAE
-static u32 kexec_pmd0[1024] PAGE_ALIGNED;
-static u32 kexec_pmd1[1024] PAGE_ALIGNED;
-#endif
-static u32 kexec_pte0[1024] PAGE_ALIGNED;
-static u32 kexec_pte1[1024] PAGE_ALIGNED;
-
static void set_idt(void *newidt, __u16 limit)
{
struct desc_ptr curidt;
@@ -76,6 +68,76 @@ static void load_segments(void)
#undef __STR
}
+static void machine_kexec_free_page_tables(struct kimage *image)
+{
+ free_page((unsigned long)image->arch.pgd);
+#ifdef CONFIG_X86_PAE
+ free_page((unsigned long)image->arch.pmd0);
+ free_page((unsigned long)image->arch.pmd1);
+#endif
+ free_page((unsigned long)image->arch.pte0);
+ free_page((unsigned long)image->arch.pte1);
+}
+
+static int machine_kexec_alloc_page_tables(struct kimage *image)
+{
+ image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+#ifdef CONFIG_X86_PAE
+ image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+#endif
+ image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!image->arch.pgd ||
+#ifdef CONFIG_X86_PAE
+ !image->arch.pmd0 || !image->arch.pmd1 ||
+#endif
+ !image->arch.pte0 || !image->arch.pte1) {
+ machine_kexec_free_page_tables(image);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void machine_kexec_page_table_set_one(
+ pgd_t *pgd, pmd_t *pmd, pte_t *pte,
+ unsigned long vaddr, unsigned long paddr)
+{
+ pud_t *pud;
+
+ pgd += pgd_index(vaddr);
+#ifdef CONFIG_X86_PAE
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+ set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
+#endif
+ pud = pud_offset(pgd, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ if (!(pmd_val(*pmd) & _PAGE_PRESENT))
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+ pte = pte_offset_kernel(pmd, vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+static void machine_kexec_prepare_page_tables(struct kimage *image)
+{
+ void *control_page;
+ pmd_t *pmd = 0;
+
+ control_page = page_address(image->control_code_page);
+#ifdef CONFIG_X86_PAE
+ pmd = image->arch.pmd0;
+#endif
+ machine_kexec_page_table_set_one(
+ image->arch.pgd, pmd, image->arch.pte0,
+ (unsigned long)control_page, __pa(control_page));
+#ifdef CONFIG_X86_PAE
+ pmd = image->arch.pmd1;
+#endif
+ machine_kexec_page_table_set_one(
+ image->arch.pgd, pmd, image->arch.pte1,
+ __pa(control_page), __pa(control_page));
+}
+
/*
* A architecture hook called to validate the
* proposed image and prepare the control pages
@@ -87,12 +149,20 @@ static void load_segments(void)
* reboot code buffer to allow us to avoid allocations
* later.
*
- * Make control page executable.
+ * - Make control page executable.
+ * - Allocate page tables
+ * - Setup page tables
*/
int machine_kexec_prepare(struct kimage *image)
{
+ int error;
+
if (nx_enabled)
set_pages_x(image->control_code_page, 1);
+ error = machine_kexec_alloc_page_tables(image);
+ if (error)
+ return error;
+ machine_kexec_prepare_page_tables(image);
return 0;
}
@@ -104,6 +174,7 @@ void machine_kexec_cleanup(struct kimage *image)
{
if (nx_enabled)
set_pages_nx(image->control_code_page, 1);
+ machine_kexec_free_page_tables(image);
}
/*
@@ -150,18 +221,7 @@ void machine_kexec(struct kimage *image)
relocate_kernel_ptr = control_page;
page_list[PA_CONTROL_PAGE] = __pa(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
- page_list[PA_PGD] = __pa(kexec_pgd);
- page_list[VA_PGD] = (unsigned long)kexec_pgd;
-#ifdef CONFIG_X86_PAE
- page_list[PA_PMD_0] = __pa(kexec_pmd0);
- page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
- page_list[PA_PMD_1] = __pa(kexec_pmd1);
- page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-#endif
- page_list[PA_PTE_0] = __pa(kexec_pte0);
- page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
- page_list[PA_PTE_1] = __pa(kexec_pte1);
- page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ page_list[PA_PGD] = __pa(image->arch.pgd);
if (image->type == KEXEC_TYPE_DEFAULT)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3b599518c32..c12314c9e86 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = {
.set_mode = mfgpt_set_mode,
.set_next_event = mfgpt_next_event,
.rating = 250,
- .cpumask = CPU_MASK_ALL,
+ .cpumask = cpu_all_mask,
.shift = 32
};
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index f98f4e1dba0..45e3b69808b 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -586,23 +586,23 @@ static void __init __get_smp_config(unsigned int early)
{
struct intel_mp_floating *mpf = mpf_found;
- if (x86_quirks->mach_get_smp_config) {
- if (x86_quirks->mach_get_smp_config(early))
- return;
- }
+ if (!mpf)
+ return;
+
if (acpi_lapic && early)
return;
+
/*
- * ACPI supports both logical (e.g. Hyper-Threading) and physical
- * processors, where MPS only supports physical.
+ * MPS doesn't support hyperthreading, aka only have
+ * thread 0 apic id in MPS table
*/
- if (acpi_lapic && acpi_ioapic) {
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
- "information\n");
+ if (acpi_lapic && acpi_ioapic)
return;
- } else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) "
- "configuration information\n");
+
+ if (x86_quirks->mach_get_smp_config) {
+ if (x86_quirks->mach_get_smp_config(early))
+ return;
+ }
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
mpf->mpf_specification);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 4caff39078e..0deea37a53c 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,7 +31,7 @@
#include <asm/numaq.h>
#include <asm/topology.h>
#include <asm/processor.h>
-#include <asm/mpspec.h>
+#include <asm/genapic.h>
#include <asm/e820.h>
#include <asm/setup.h>
@@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void)
return 1;
}
+static int __init numaq_update_genapic(void)
+{
+ genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
+
+ return 0;
+}
+
static struct x86_quirks numaq_x86_quirks __initdata = {
.arch_pre_time_init = numaq_pre_time_init,
.arch_time_init = NULL,
@@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
.mpc_oem_pci_bus = mpc_oem_pci_bus,
.smp_read_mpc_oem = smp_read_mpc_oem,
.setup_ioapic_ids = numaq_setup_ioapic_ids,
+ .update_genapic = numaq_update_genapic,
};
void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 0e9f1982b1d..95777b0faa7 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -7,7 +7,8 @@
#include <asm/paravirt.h>
-static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+static inline void
+default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_lock(lock);
}
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e1e731d78f3..d28bbdc35e4 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1567,7 +1567,7 @@ static int __init calgary_parse_options(char *p)
++p;
if (*p == '\0')
break;
- bridge = simple_strtol(p, &endp, 0);
+ bridge = simple_strtoul(p, &endp, 0);
if (p == endp)
break;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a42b02b4df6..ba7ad83e20a 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -123,6 +123,8 @@ static void free_iommu(unsigned long offset, int size)
spin_lock_irqsave(&iommu_bitmap_lock, flags);
iommu_area_free(iommu_gart_bitmap, offset, size);
+ if (offset >= next_bit)
+ next_bit = offset + size;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772744d..95d811a9594 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,7 +7,9 @@
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
+#include <linux/ftrace.h>
#include <asm/system.h>
+#include <asm/apic.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -100,6 +102,9 @@ static inline int hlt_use_halt(void)
void default_idle(void)
{
if (hlt_use_halt()) {
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, 1);
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -112,6 +117,7 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
+ trace_power_end(&it);
} else {
local_irq_enable();
/* loop is done by the caller */
@@ -122,6 +128,21 @@ void default_idle(void)
EXPORT_SYMBOL(default_idle);
#endif
+void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU:
+ */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+ disable_local_APIC();
+
+ for (;;) {
+ if (hlt_works(smp_processor_id()))
+ halt();
+ }
+}
+
static void do_nothing(void *unused)
{
}
@@ -154,24 +175,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
if (!need_resched()) {
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(ax, cx);
}
+ trace_power_end(&it);
}
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
+ struct power_trace it;
if (!need_resched()) {
+ trace_power_start(&it, POWER_CSTATE, 1);
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__sti_mwait(0, 0);
else
local_irq_enable();
+ trace_power_end(&it);
} else
local_irq_enable();
}
@@ -183,9 +211,13 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
+ struct power_trace it;
+
+ trace_power_start(&it, POWER_CSTATE, 0);
local_irq_enable();
while (!need_resched())
cpu_relax();
+ trace_power_end(&it);
}
/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d4..24c2276aa45 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/dmi.h>
+#include <linux/ftrace.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
-struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120fb1b..fbb321d53d3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/ftrace.h>
#include <asm/pgtable.h>
#include <asm/system.h>
@@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
* - could test fs/gs bitsliced
*
* Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer not supported too.
*/
-struct task_struct *
+__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a6d8c12e10..2c8ec1ba75e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
size_t bts_index, bts_end;
int error;
- error = ds_get_bts_end(child, &bts_end);
+ error = ds_get_bts_end(child->bts, &bts_end);
if (error < 0)
return error;
if (bts_end <= index)
return -EINVAL;
- error = ds_get_bts_index(child, &bts_index);
+ error = ds_get_bts_index(child->bts, &bts_index);
if (error < 0)
return error;
@@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
if (bts_end <= bts_index)
bts_index -= bts_end;
- error = ds_access_bts(child, bts_index, &bts_record);
+ error = ds_access_bts(child->bts, bts_index, &bts_record);
if (error < 0)
return error;
@@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child,
size_t end, i;
int error;
- error = ds_get_bts_index(child, &end);
+ error = ds_get_bts_index(child->bts, &end);
if (error < 0)
return error;
if (size < (end * sizeof(struct bts_struct)))
return -EIO;
- error = ds_access_bts(child, 0, (const void **)&raw);
+ error = ds_access_bts(child->bts, 0, (const void **)&raw);
if (error < 0)
return error;
@@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child,
return -EFAULT;
}
- error = ds_clear_bts(child);
+ error = ds_clear_bts(child->bts);
if (error < 0)
return error;
return end;
}
-static void ptrace_bts_ovfl(struct task_struct *child)
-{
- send_sig(child->thread.bts_ovfl_signal, child, 0);
-}
-
static int ptrace_bts_config(struct task_struct *child,
long cfg_size,
const struct ptrace_bts_config __user *ucfg)
@@ -760,23 +755,45 @@ static int ptrace_bts_config(struct task_struct *child,
goto errout;
if (cfg.flags & PTRACE_BTS_O_ALLOC) {
- ds_ovfl_callback_t ovfl = NULL;
+ bts_ovfl_callback_t ovfl = NULL;
unsigned int sig = 0;
- /* we ignore the error in case we were not tracing child */
- (void)ds_release_bts(child);
+ error = -EINVAL;
+ if (cfg.size < (10 * bts_cfg.sizeof_bts))
+ goto errout;
if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
if (!cfg.signal)
goto errout;
+ error = -EOPNOTSUPP;
+ goto errout;
+
sig = cfg.signal;
- ovfl = ptrace_bts_ovfl;
}
- error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
- if (error < 0)
+ if (child->bts) {
+ (void)ds_release_bts(child->bts);
+ kfree(child->bts_buffer);
+
+ child->bts = NULL;
+ child->bts_buffer = NULL;
+ }
+
+ error = -ENOMEM;
+ child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
+ if (!child->bts_buffer)
+ goto errout;
+
+ child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
+ ovfl, /* th = */ (size_t)-1);
+ if (IS_ERR(child->bts)) {
+ error = PTR_ERR(child->bts);
+ kfree(child->bts_buffer);
+ child->bts = NULL;
+ child->bts_buffer = NULL;
goto errout;
+ }
child->thread.bts_ovfl_signal = sig;
}
@@ -823,15 +840,15 @@ static int ptrace_bts_status(struct task_struct *child,
if (cfg_size < sizeof(cfg))
return -EIO;
- error = ds_get_bts_end(child, &end);
+ error = ds_get_bts_end(child->bts, &end);
if (error < 0)
return error;
- error = ds_access_bts(child, /* index = */ 0, &base);
+ error = ds_access_bts(child->bts, /* index = */ 0, &base);
if (error < 0)
return error;
- error = ds_access_bts(child, /* index = */ end, &max);
+ error = ds_access_bts(child->bts, /* index = */ end, &max);
if (error < 0)
return error;
@@ -884,10 +901,7 @@ static int ptrace_bts_write_record(struct task_struct *child,
return -EINVAL;
}
- /* The writing task will be the switched-to task on a context
- * switch. It needs to write into the switched-from task's BTS
- * buffer. */
- return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
+ return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);
}
void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -929,17 +943,16 @@ void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
+ case 0 ... 0xC:
+ /* sorry, don't know about them */
+ break;
case 0xD:
case 0xE: /* Pentium M */
bts_configure(&bts_cfg_pentium_m);
break;
- case 0xF: /* Core2 */
- case 0x1C: /* Atom */
+ default: /* Core2, Atom, ... */
bts_configure(&bts_cfg_core2);
break;
- default:
- /* sorry, don't know about them */
- break;
}
break;
case 0xF:
@@ -973,13 +986,17 @@ void ptrace_disable(struct task_struct *child)
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
#ifdef CONFIG_X86_PTRACE_BTS
- (void)ds_release_bts(child);
+ if (child->bts) {
+ (void)ds_release_bts(child->bts);
+ kfree(child->bts_buffer);
+ child->bts_buffer = NULL;
- child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+ if (!child->thread.debugctlmsr)
+ clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+ clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+ }
#endif /* CONFIG_X86_PTRACE_BTS */
}
@@ -1111,9 +1128,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
(child, data, (struct ptrace_bts_config __user *)addr);
break;
- case PTRACE_BTS_SIZE:
- ret = ds_get_bts_index(child, /* pos = */ NULL);
+ case PTRACE_BTS_SIZE: {
+ size_t size;
+
+ ret = ds_get_bts_index(child->bts, &size);
+ if (ret == 0) {
+ BUG_ON(size != (int) size);
+ ret = (int) size;
+ }
break;
+ }
case PTRACE_BTS_GET:
ret = ptrace_bts_read_record
@@ -1121,7 +1145,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_CLEAR:
- ret = ds_clear_bts(child);
+ ret = ds_clear_bts(child->bts);
break;
case PTRACE_BTS_DRAIN:
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 724adfc63cb..ba7b9a0e606 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -21,6 +21,9 @@
# include <asm/iommu.h>
#endif
+#include <mach_ipi.h>
+
+
/*
* Power off function, if any
*/
@@ -36,7 +39,10 @@ int reboot_force;
static int reboot_cpu = -1;
#endif
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old]
+/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
+bool port_cf9_safe = false;
+
+/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
warm Don't set the cold reboot flag
cold Set the cold reboot flag
bios Reboot by jumping through the BIOS (only for X86_32)
@@ -45,6 +51,7 @@ static int reboot_cpu = -1;
kbd Use the keyboard controller. cold reset (default)
acpi Use the RESET_REG in the FADT
efi Use efi reset_system runtime service
+ pci Use the so-called "PCI reset register", CF9
force Avoid anything that could hang.
*/
static int __init reboot_setup(char *str)
@@ -79,6 +86,7 @@ static int __init reboot_setup(char *str)
case 'k':
case 't':
case 'e':
+ case 'p':
reboot_type = *str;
break;
@@ -169,6 +177,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
},
},
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 330",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
@@ -395,12 +412,27 @@ static void native_machine_emergency_restart(void)
reboot_type = BOOT_KBD;
break;
-
case BOOT_EFI:
if (efi_enabled)
- efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD,
+ efi.reset_system(reboot_mode ?
+ EFI_RESET_WARM :
+ EFI_RESET_COLD,
EFI_SUCCESS, 0, NULL);
+ reboot_type = BOOT_KBD;
+ break;
+ case BOOT_CF9:
+ port_cf9_safe = true;
+ /* fall through */
+
+ case BOOT_CF9_COND:
+ if (port_cf9_safe) {
+ u8 cf9 = inb(0xcf9) & ~6;
+ outb(cf9|2, 0xcf9); /* Request hard reset */
+ udelay(50);
+ outb(cf9|6, 0xcf9); /* Actually do the reset */
+ udelay(50);
+ }
reboot_type = BOOT_KBD;
break;
}
@@ -461,6 +493,11 @@ static void native_machine_restart(char *__unused)
static void native_machine_halt(void)
{
+ /* stop other cpus and apics */
+ machine_shutdown();
+
+ /* stop this cpu */
+ stop_this_cpu(NULL);
}
static void native_machine_power_off(void)
@@ -514,3 +551,92 @@ void machine_crash_shutdown(struct pt_regs *regs)
machine_ops.crash_shutdown(regs);
}
#endif
+
+
+#if defined(CONFIG_SMP)
+
+/* This keeps a track of which one is crashing cpu. */
+static int crashing_cpu;
+static nmi_shootdown_cb shootdown_callback;
+
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ int cpu;
+
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_OK;
+
+ cpu = raw_smp_processor_id();
+
+ /* Don't do anything if this handler is invoked on crashing cpu.
+ * Otherwise, system will completely hang. Crashing cpu can get
+ * an NMI if system was initially booted with nmi_watchdog parameter.
+ */
+ if (cpu == crashing_cpu)
+ return NOTIFY_STOP;
+ local_irq_disable();
+
+ shootdown_callback(cpu, (struct die_args *)data);
+
+ atomic_dec(&waiting_for_crash_ipi);
+ /* Assume hlt works */
+ halt();
+ for (;;)
+ cpu_relax();
+
+ return 1;
+}
+
+static void smp_send_nmi_allbutself(void)
+{
+ send_IPI_allbutself(NMI_VECTOR);
+}
+
+static struct notifier_block crash_nmi_nb = {
+ .notifier_call = crash_nmi_callback,
+};
+
+/* Halt all other CPUs, calling the specified function on each of them
+ *
+ * This function can be used to halt all other CPUs on crash
+ * or emergency reboot time. The function passed as parameter
+ * will be called inside a NMI handler on all CPUs.
+ */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+ unsigned long msecs;
+ local_irq_disable();
+
+ /* Make a note of crashing cpu. Will be used in NMI callback.*/
+ crashing_cpu = safe_smp_processor_id();
+
+ shootdown_callback = callback;
+
+ atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+ /* Would it be better to replace the trap vector here? */
+ if (register_die_notifier(&crash_nmi_nb))
+ return; /* return what? */
+ /* Ensure the new callback function is set before sending
+ * out the NMI
+ */
+ wmb();
+
+ smp_send_nmi_allbutself();
+
+ msecs = 1000; /* Wait at most a second for the other cpus to stop */
+ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+ mdelay(1);
+ msecs--;
+ }
+
+ /* Leave the nmi callback set */
+}
+#else /* !CONFIG_SMP */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+ /* No other CPUs to shoot down */
+}
+#endif
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 6f50664b2ba..a160f311972 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -10,15 +10,12 @@
#include <asm/page.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
-#include <asm/pgtable.h>
/*
* Must be relocatable PIC code callable as a C function
*/
#define PTR(x) (x << 2)
-#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define PAE_PGD_ATTR (_PAGE_PRESENT)
/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
* ~ control_page + PAGE_SIZE are used as data storage and stack for
@@ -39,7 +36,6 @@
#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
.text
- .align PAGE_SIZE
.globl relocate_kernel
relocate_kernel:
/* Save the CPU context, used for jumping back */
@@ -60,117 +56,6 @@ relocate_kernel:
movl %cr4, %eax
movl %eax, CR4(%edi)
-#ifdef CONFIG_X86_PAE
- /* map the control page at its virtual address */
-
- movl PTR(VA_PGD)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0xc0000000, %eax
- shrl $27, %eax
- addl %edi, %eax
-
- movl PTR(PA_PMD_0)(%ebp), %edx
- orl $PAE_PGD_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PMD_0)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0x3fe00000, %eax
- shrl $18, %eax
- addl %edi, %eax
-
- movl PTR(PA_PTE_0)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PTE_0)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0x001ff000, %eax
- shrl $9, %eax
- addl %edi, %eax
-
- movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- /* identity map the control page at its physical address */
-
- movl PTR(VA_PGD)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0xc0000000, %eax
- shrl $27, %eax
- addl %edi, %eax
-
- movl PTR(PA_PMD_1)(%ebp), %edx
- orl $PAE_PGD_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PMD_1)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0x3fe00000, %eax
- shrl $18, %eax
- addl %edi, %eax
-
- movl PTR(PA_PTE_1)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PTE_1)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0x001ff000, %eax
- shrl $9, %eax
- addl %edi, %eax
-
- movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-#else
- /* map the control page at its virtual address */
-
- movl PTR(VA_PGD)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0xffc00000, %eax
- shrl $20, %eax
- addl %edi, %eax
-
- movl PTR(PA_PTE_0)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PTE_0)(%ebp), %edi
- movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
- andl $0x003ff000, %eax
- shrl $10, %eax
- addl %edi, %eax
-
- movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- /* identity map the control page at its physical address */
-
- movl PTR(VA_PGD)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0xffc00000, %eax
- shrl $20, %eax
- addl %edi, %eax
-
- movl PTR(PA_PTE_1)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-
- movl PTR(VA_PTE_1)(%ebp), %edi
- movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
- andl $0x003ff000, %eax
- shrl $10, %eax
- addl %edi, %eax
-
- movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
- orl $PAGE_ATTR, %edx
- movl %edx, (%eax)
-#endif
-
-relocate_new_kernel:
/* read the arguments and say goodbye to the stack */
movl 20+4(%esp), %ebx /* page_list */
movl 20+8(%esp), %ebp /* list of pages */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0fa6790c1dd..32fe42f26e7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -583,7 +583,20 @@ static int __init setup_elfcorehdr(char *arg)
early_param("elfcorehdr", setup_elfcorehdr);
#endif
-static struct x86_quirks default_x86_quirks __initdata;
+static int __init default_update_genapic(void)
+{
+#ifdef CONFIG_X86_SMP
+# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
+ genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
+# endif
+#endif
+
+ return 0;
+}
+
+static struct x86_quirks default_x86_quirks __initdata = {
+ .update_genapic = default_update_genapic,
+};
struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
@@ -764,7 +777,7 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
.callback = dmi_low_memory_corruption,
.ident = "Phoenix BIOS",
.matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
},
},
#endif
@@ -794,6 +807,9 @@ void __init setup_arch(char **cmdline_p)
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
+ /* VMI may relocate the fixmap; do this before touching ioremap area */
+ vmi_init();
+
early_cpu_init();
early_ioremap_init();
@@ -880,13 +896,8 @@ void __init setup_arch(char **cmdline_p)
check_efer();
#endif
-#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
- /*
- * Must be before kernel pagetables are setup
- * or fixmap area is touched.
- */
- vmi_init();
-#endif
+ /* Must be before kernel pagetables are setup */
+ vmi_activate();
/* after early param, so could get panic from serial */
reserve_early_setup_data();
@@ -1082,7 +1093,7 @@ void __init setup_arch(char **cmdline_p)
ioapic_init_mappings();
/* need to wait for io_apic is mapped */
- nr_irqs = probe_nr_irqs();
+ probe_nr_irqs_gsi();
kvm_guest_init();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3bb77..0b63b08e753 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -152,6 +152,11 @@ void __init setup_per_cpu_areas(void)
old_size = PERCPU_ENOUGH_ROOM;
align = max_t(unsigned long, PAGE_SIZE, align);
size = roundup(old_size, align);
+
+ printk(KERN_INFO
+ "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+ NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+
printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
size);
@@ -168,24 +173,24 @@ void __init setup_per_cpu_areas(void)
"cpu %d has no node %d or node-local memory\n",
cpu, node);
if (ptr)
- printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+ printk(KERN_DEBUG
+ "per cpu data for cpu%d at %016lx\n",
cpu, __pa(ptr));
}
else {
ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
__pa(MAX_DMA_ADDRESS));
if (ptr)
- printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
- cpu, node, __pa(ptr));
+ printk(KERN_DEBUG
+ "per cpu data for cpu%d on node%d "
+ "at %016lx\n",
+ cpu, node, __pa(ptr));
}
#endif
per_cpu_offset(cpu) = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
}
- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
- NR_CPUS, nr_cpu_ids, nr_node_ids);
-
/* Setup percpu data maps */
setup_per_cpu_maps();
@@ -282,7 +287,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
else
cpu_clear(cpu, *mask);
- cpulist_scnprintf(buf, sizeof(buf), *mask);
+ cpulist_scnprintf(buf, sizeof(buf), mask);
printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18f9b19f5f8..49ed667b06f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -118,41 +118,28 @@ static void native_smp_send_reschedule(int cpu)
WARN_ON(1);
return;
}
- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+ send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
- send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+ send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
}
-void native_send_call_func_ipi(cpumask_t mask)
+void native_send_call_func_ipi(const struct cpumask *mask)
{
cpumask_t allbutself;
allbutself = cpu_online_map;
cpu_clear(smp_processor_id(), allbutself);
- if (cpus_equal(mask, allbutself) &&
+ if (cpus_equal(*mask, allbutself) &&
cpus_equal(cpu_online_map, cpu_callout_map))
send_IPI_allbutself(CALL_FUNCTION_VECTOR);
else
send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
}
-static void stop_this_cpu(void *dummy)
-{
- local_irq_disable();
- /*
- * Remove this CPU:
- */
- cpu_clear(smp_processor_id(), cpu_online_map);
- disable_local_APIC();
- if (hlt_works(smp_processor_id()))
- for (;;) halt();
- for (;;);
-}
-
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7b109339731..be946678804 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,6 +62,7 @@
#include <asm/mtrr.h>
#include <asm/vmi.h>
#include <asm/genapic.h>
+#include <asm/setup.h>
#include <linux/mc146818rtc.h>
#include <mach_apic.h>
@@ -101,14 +102,8 @@ EXPORT_SYMBOL(smp_num_siblings);
/* Last level cache ID of each logical CPU */
DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
-/* bitmap of online cpus */
-cpumask_t cpu_online_map __read_mostly;
-EXPORT_SYMBOL(cpu_online_map);
-
cpumask_t cpu_callin_map;
cpumask_t cpu_callout_map;
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
@@ -294,9 +289,7 @@ static void __cpuinit start_secondary(void *unused)
* fragile that we want to limit the things done here to the
* most necessary things.
*/
-#ifdef CONFIG_VMI
vmi_bringup();
-#endif
cpu_init();
preempt_disable();
smp_callin();
@@ -536,7 +529,7 @@ static void impress_friends(void)
pr_debug("Before bogocount - setting activated=1.\n");
}
-static inline void __inquire_remote_apic(int apicid)
+void __inquire_remote_apic(int apicid)
{
unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
char *names[] = { "ID", "VERSION", "SPIV" };
@@ -575,14 +568,13 @@ static inline void __inquire_remote_apic(int apicid)
}
}
-#ifdef WAKE_SECONDARY_VIA_NMI
/*
* Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
*/
-static int __devinit
-wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt;
@@ -599,7 +591,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
@@ -614,11 +606,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-#endif /* WAKE_SECONDARY_VIA_NMI */
-#ifdef WAKE_SECONDARY_VIA_INIT
-static int __devinit
-wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt, num_starts, j;
@@ -737,7 +727,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-#endif /* WAKE_SECONDARY_VIA_INIT */
struct create_idle {
struct work_struct work;
@@ -1355,7 +1344,7 @@ void cpu_disable_common(void)
lock_vector_lock();
remove_cpu_from_maps(cpu);
unlock_vector_lock();
- fixup_irqs(cpu_online_map);
+ fixup_irqs();
}
int native_cpu_disable(void)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c..10786af9554 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
#include <linux/sched.h>
#include <linux/stacktrace.h>
#include <linux/module.h>
+#include <linux/uaccess.h>
#include <asm/stacktrace.h>
static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame {
+ const void __user *next_fp;
+ unsigned long ret_addr;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ return 0;
+
+ ret = 1;
+ pagefault_disable();
+ if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ ret = 0;
+ pagefault_enable();
+
+ return ret;
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+ const struct pt_regs *regs = task_pt_regs(current);
+ const void __user *fp = (const void __user *)regs->bp;
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = regs->ip;
+
+ while (trace->nr_entries < trace->max_entries) {
+ struct stack_frame frame;
+
+ frame.next_fp = NULL;
+ frame.ret_addr = 0;
+ if (!copy_stack_frame(fp, &frame))
+ break;
+ if ((unsigned long)fp < regs->sp)
+ break;
+ if (frame.ret_addr) {
+ trace->entries[trace->nr_entries++] =
+ frame.ret_addr;
+ }
+ if (fp == frame.next_fp)
+ break;
+ fp = frame.next_fp;
+ }
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+ /*
+ * Trace user stack if we are not a kernel thread
+ */
+ if (current->mm) {
+ __save_stack_trace_user(trace);
+ }
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index f4049f3513b..174ea90d1cb 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -164,7 +164,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
* We have to send the IPI only to
* CPUs affected.
*/
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+ send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
while (!cpus_empty(flush_cpumask))
/* nothing. lockup detection does not belong here */
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index 8f919ca6949..de6f1bda0c5 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
* We have to send the IPI only to
* CPUs affected.
*/
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+ send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
while (!cpus_empty(f->flush_cpumask))
cpu_relax();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9ffb01c31c4..1c0dfbca87c 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -46,7 +46,9 @@ static __cpuinit void check_tsc_warp(void)
cycles_t start, now, prev, end;
int i;
+ rdtsc_barrier();
start = get_cycles();
+ rdtsc_barrier();
/*
* The measurement runs for 20 msecs:
*/
@@ -61,7 +63,9 @@ static __cpuinit void check_tsc_warp(void)
*/
__raw_spin_lock(&sync_lock);
prev = last_tsc;
+ rdtsc_barrier();
now = get_cycles();
+ rdtsc_barrier();
last_tsc = now;
__raw_spin_unlock(&sync_lock);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 8b6c393ab9f..22fd6577156 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -960,8 +960,6 @@ static inline int __init activate_vmi(void)
void __init vmi_init(void)
{
- unsigned long flags;
-
if (!vmi_rom)
probe_vmi_rom();
else
@@ -973,13 +971,21 @@ void __init vmi_init(void)
reserve_top_address(-vmi_rom->virtual_top);
- local_irq_save(flags);
- activate_vmi();
-
#ifdef CONFIG_X86_IO_APIC
/* This is virtual hardware; timer routing is wired correctly */
no_timer_check = 1;
#endif
+}
+
+void vmi_activate(void)
+{
+ unsigned long flags;
+
+ if (!vmi_rom)
+ return;
+
+ local_irq_save(flags);
+ activate_vmi();
local_irq_restore(flags & X86_EFLAGS_IF);
}
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 254ee07f863..c4c1f9e0940 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
/* Upper bound is clockevent's use of ulong for cycle deltas. */
evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
evt->min_delta_ns = clockevent_delta2ns(1, evt);
- evt->cpumask = cpumask_of_cpu(cpu);
+ evt->cpumask = cpumask_of(cpu);
printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
evt->name, evt->mult, evt->shift);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86..6f3d3d4cd97 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
* want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index b13acb75e82..15c3e699918 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -310,7 +310,7 @@ static void __init setup_xstate_init(void)
/*
* Enable and initialize the xsave feature.
*/
-void __init xsave_cntxt_init(void)
+void __ref xsave_cntxt_init(void)
{
unsigned int eax, ebx, ecx, edx;